In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import entropy
In [2]:
# Import the cleaned and preprocessed Our World in Data COVID-19 dataset.
# NOTE(review): hardcoded absolute Windows path -- consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[2]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [3]:
# Country pairings based on total population (13 pairs).  The original
# cell repeated the same boolean-mask filter 26 times; a one-line helper
# removes the duplication while keeping every assignment's behavior and
# module-level name identical.
def _country_rows(name):
    """Return the rows of `df` whose 'location' equals `name`."""
    return df[df.location == name]

df_Austria = _country_rows("Austria")
df_Switzerland = _country_rows("Switzerland")

df_Belgium = _country_rows("Belgium")
df_Canada = _country_rows("Canada")

df_Bulgaria = _country_rows("Bulgaria")
df_Serbia = _country_rows("Serbia")

df_Cyprus = _country_rows("Cyprus")
df_Luxembourg = _country_rows("Luxembourg")

df_Czechia = _country_rows("Czechia")
df_Romania = _country_rows("Romania")

df_Denmark = _country_rows("Denmark")
df_Ireland = _country_rows("Ireland")

df_Estonia = _country_rows("Estonia")
df_Latvia = _country_rows("Latvia")

df_Finland = _country_rows("Finland")
df_Iceland = _country_rows("Iceland")

df_France = _country_rows("France")
df_Italy = _country_rows("Italy")

df_Netherlands = _country_rows("Netherlands")
df_Sweden = _country_rows("Sweden")

df_Portugal = _country_rows("Portugal")
df_Spain = _country_rows("Spain")

df_Slovakia = _country_rows("Slovakia")
df_Slovenia = _country_rows("Slovenia")

df_UnitedKingdom = _country_rows("United Kingdom")
df_UnitedStates = _country_rows("United States")
In [4]:
# Drop the first two UK rows (identical to the original `.tail(-2)`);
# presumably this aligns the UK series' start with the other countries
# -- TODO confirm.
df_UnitedKingdom_new = df_UnitedKingdom.iloc[2:]
In [5]:
# Stack every per-country frame (both members of each pair) into a single
# dataframe.  df_UnitedKingdom_new (first two rows dropped) replaces
# df_UnitedKingdom here.
dataframes = [
    df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia,
    df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands,
    df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland,
    df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland,
    df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain,
    df_Slovenia, df_UnitedStates,
]
dataframe_one = pd.concat(dataframes)

# Persist the combined frame to CSV (the index column is written too,
# exactly as before).
dataframe_one.to_csv("dataframe-one.csv")
In [6]:
# Re-import the combined pairing dataframe written in the previous cell.
# NOTE(review): `dataframe_one` is already in memory, so this CSV
# round-trip is unnecessary.  Also, the file was written to the working
# directory but read from Downloads, and the displayed result has 17
# columns (no stringency_index) vs. 18 above -- confirm this is the
# intended file.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[6]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [7]:
# Countries in this population-based pairing.
country1 = 'Austria'
country2 = 'Switzerland'

# Population-health features for the Random Forest analysis, plus the
# identifier columns and the target ('Mortality Rate').
population_health_cols = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
    'median_age', 'Mortality Rate',
]
pair_mask = df_firstCountryPairing['location'].isin([country1, country2])
df_firstCountryPairing = df_firstCountryPairing.loc[pair_mask, population_health_cols]
In [8]:
# Preview the filtered pairing (rich display via the last expression).
df_firstCountryPairing
Out[8]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14645 Switzerland 12/26/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14646 Switzerland 12/27/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14647 Switzerland 12/28/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.323082
14648 Switzerland 12/29/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322149

2078 rows × 10 columns

In [9]:
# Random Forests need tabular supervised data, so the time series is
# converted by adding lagged copies of the target -- the previous day's,
# week's, and month's mortality rate -- computed *within* each country
# via groupby + shift.  (The original cell carried this rationale in a
# no-op triple-quoted string expression; it belongs in comments or a
# markdown cell instead.)
# Using .assign returns a new frame, avoiding the SettingWithCopyWarning
# that direct column assignment on a filtered frame can raise.
mortality_by_country = df_firstCountryPairing.groupby('location')['Mortality Rate']
df_firstCountryPairing = df_firstCountryPairing.assign(
    prev_day_mortality=mortality_by_country.shift(1),
    prev_week_mortality=mortality_by_country.shift(7),
    prev_month_mortality=mortality_by_country.shift(30),
)
In [10]:
# The first rows of each country's series have no history for the lag
# windows; fill those NaNs with 0 in one vectorized step.
# (Assumes 0 is an appropriate imputation for pre-series mortality --
# consistent with the 0.0 rates observed at each series' start.)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_firstCountryPairing[lag_cols] = df_firstCountryPairing[lag_cols].fillna(0)
In [11]:
# Perform Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target (and target-derived features) leak into
# the components later used as model inputs -- confirm this is intended.
# NOTE(review): PCA is also fit on unscaled data and on the full dataset
# (before the train/test split), which leaks test information into the fit.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[11]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [12]:
# Keep the first 7 principal components -- chosen to match the number of
# population-health input variables, not by explained variance.
n_components = 7  # NOTE(review): PCA above saw 11 columns (8 selected features incl. the target + 3 lags); the 7 kept components mix all of them
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [13]:
# Wrap the kept components in a DataFrame, re-attaching 'location'.
# NOTE(review): these columns are *principal components* merely relabeled
# with the original feature names; each is a linear mix of all PCA inputs
# (including mortality-derived ones).  Downstream "feature importances"
# therefore describe components, not the named raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [14]:
# One-hot encode the 'location' column via get_dummies().
# NOTE(review): the resulting location_* dummy columns are never fed to
# the model below (X comes from principal_df); only 'Mortality Rate' is
# read from this frame afterwards -- confirm the encoding is still needed.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [15]:
# Model inputs: the 7 principal-component columns; target: the raw
# mortality rate.  `selected_cols` is reused later to label importances.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_firstCountryPairing['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [16]:
# Fit the standardizer on the training split only (test data must not
# influence the scaling parameters); show the fitted estimator's repr.
scaler = StandardScaler().fit(X_train)
scaler
Out[16]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [17]:
# Apply the train-fitted standardization to the training set.
# NOTE(review): scaling happens on PCA scores *after* PCA, while PCA was
# fit on unscaled data; the conventional pipeline order is scale -> PCA.
X_train_scaled = scaler.transform(X_train)
In [18]:
# Apply the same train-fitted scaler to the test set (no refit, so no
# test-set leakage at this step).
X_test_scaled = scaler.transform(X_test)
In [19]:
# Base estimator; the grid below overrides n_estimators during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space (3 x 3 x 3 x 3 = 81 candidates).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [20]:
# Exhaustive grid search with 10-fold cross-validation: 81 candidates
# x 10 folds = 810 model fits.  n_jobs=-1 parallelizes across all CPU
# cores without changing the (seeded) results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989966036956606
In [21]:
# Refit a forest with the best hyperparameters found above.  Unpacking
# best_params_ avoids re-typing each key (the original copied all four by
# hand, which silently drifts out of sync if the grid ever changes).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality on the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [22]:
# Evaluate the Random Forest Model: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between
# two *probability distributions* (it normalizes both inputs to sum to 1).
# Applying it to raw regression targets/predictions is statistically
# questionable and returns inf if y_pred is 0 where y_test is not --
# confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002337643207376714
R2 Score: 0.9987160137717644
RMSE: 0.048349
Entropy Value: 0.0003719633457117023
In [23]:
# NOTE(review): these "importances" belong to the PCA component columns,
# which were merely *labeled* with the raw feature names -- they are not
# importances of the original features themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[23]:
feature importance
1 diabetes_prevalence 0.943736
6 median_age 0.039143
2 female_smokers 0.012458
3 male_smokers 0.001942
0 cardiovasc_death_rate 0.001906
5 aged_65_older 0.000619
4 life_expectancy 0.000196
In [24]:
# Re-import the combined pairing dataframe for the country-health-index run.
# NOTE(review): hardcoded absolute path; `dataframe_one` is still in memory
# and could be reused directly (e.g. df_updated = dataframe_one.copy()).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[24]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [25]:
# Countries in this population-based pairing.
country1 = 'Austria'
country2 = 'Switzerland'

# Country-health-index features for the Random Forest analysis, plus the
# identifier columns and the target ('Mortality Rate').
country_health_cols = [
    'location', 'date', 'hospital_beds_per_thousand',
    'human_development_index', 'extreme_poverty', 'gdp_per_capita',
    'population_density', 'Mortality Rate',
]
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, country_health_cols]
In [26]:
# Preview the filtered pairing (rich display via the last expression).
df_updated
Out[26]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.70 45436.686 106.749 0.000000
1 Austria 2/26/2020 7.37 0.922 0.70 45436.686 106.749 0.000000
2 Austria 2/27/2020 7.37 0.922 0.70 45436.686 106.749 0.000000
3 Austria 2/28/2020 7.37 0.922 0.70 45436.686 106.749 0.000000
4 Austria 2/29/2020 7.37 0.922 0.70 45436.686 106.749 0.000000
... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 4.53 0.955 0.03 57410.166 214.243 0.322922
14645 Switzerland 12/26/2022 4.53 0.955 0.03 57410.166 214.243 0.322922
14646 Switzerland 12/27/2022 4.53 0.955 0.03 57410.166 214.243 0.322922
14647 Switzerland 12/28/2022 4.53 0.955 0.03 57410.166 214.243 0.323082
14648 Switzerland 12/29/2022 4.53 0.955 0.03 57410.166 214.243 0.322149

2078 rows × 8 columns

In [27]:
# As in the population-health run: convert the time series to a supervised
# learning problem by adding lagged copies of the target (previous day's,
# week's, and month's mortality rate), computed within each country via
# groupby + shift.  (The original cell held this rationale in a no-op
# triple-quoted string expression; comments/markdown are the right place.)
# Using .assign returns a new frame, avoiding the SettingWithCopyWarning
# that direct column assignment on a filtered frame can raise.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_country.shift(1),
    prev_week_mortality=mortality_by_country.shift(7),
    prev_month_mortality=mortality_by_country.shift(30),
)
In [28]:
# The first rows of each country's series have no history for the lag
# windows; fill those NaNs with 0 in one vectorized step.
# (Assumes 0 is an appropriate imputation for pre-series mortality.)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [32]:
# Perform Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target (and target-derived features) leak into
# the components later used as model inputs -- confirm this is intended.
# NOTE(review): PCA is also fit on unscaled data and on the full dataset
# (before the train/test split), which leaks test information into the fit.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[32]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [33]:
# Keep the first 5 principal components -- chosen to match the number of
# country-health-index input variables, not by explained variance.
n_components = 5  # NOTE(review): PCA above saw 9 columns (6 selected features incl. the target + 3 lags); the 5 kept components mix all of them
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [34]:
# Wrap the kept components in a DataFrame, re-attaching 'location'.
# NOTE(review): these columns are *principal components* merely relabeled
# with the original feature names; each is a linear mix of all PCA inputs
# (including mortality-derived ones).  Downstream "feature importances"
# therefore describe components, not the named raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [35]:
# One-hot encode the 'location' column via get_dummies().
# NOTE(review): the resulting location_* dummy columns are never fed to
# the model below (X comes from principal_df); only 'Mortality Rate' is
# read from this frame afterwards -- confirm the encoding is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [36]:
# Model inputs: the 5 principal-component columns; target: the raw
# mortality rate.  `selected_cols` is reused later to label importances.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [37]:
# Fit the standardizer on the training split only (test data must not
# influence the scaling parameters); show the fitted estimator's repr.
scaler = StandardScaler().fit(X_train)
scaler
Out[37]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [38]:
# Apply the train-fitted standardization to the training set.
# NOTE(review): scaling happens on PCA scores *after* PCA, while PCA was
# fit on unscaled data; the conventional pipeline order is scale -> PCA.
X_train_scaled = scaler.transform(X_train)
In [39]:
# Apply the same train-fitted scaler to the test set (no refit, so no
# test-set leakage at this step).
X_test_scaled = scaler.transform(X_test)
In [40]:
# Base estimator; the grid below overrides n_estimators during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space (3 x 3 x 3 x 3 = 81 candidates).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [41]:
# Exhaustive grid search with 10-fold cross-validation: 81 candidates
# x 10 folds = 810 model fits.  n_jobs=-1 parallelizes across all CPU
# cores without changing the (seeded) results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989300549094547
In [42]:
# Refit a forest with the best hyperparameters found above.  Unpacking
# best_params_ avoids re-typing each key (the original copied all four by
# hand, which silently drifts out of sync if the grid ever changes).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality on the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [43]:
# Evaluate the Random Forest Model: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between
# two *probability distributions* (it normalizes both inputs to sum to 1).
# Applying it to raw regression targets/predictions is statistically
# questionable and returns inf if y_pred is 0 where y_test is not --
# confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0015429193968664893
R2 Score: 0.9991525279603822
RMSE: 0.039280
Entropy Value: 0.0003132008576474396
In [44]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[44]:
feature importance
1 human_development_index 0.974799
2 extreme_poverty 0.022650
3 gdp_per_capita 0.001858
0 hospital_beds_per_thousand 0.000376
4 population_density 0.000317
In [45]:
# Re-import the combined pairing dataframe for the Belgium/Canada run.
# NOTE(review): hardcoded absolute path; `dataframe_one` is still in
# memory and could be reused directly instead of a CSV round-trip.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[45]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [46]:
# Countries in this population-based pairing.
country1 = 'Belgium'
country2 = 'Canada'

# Population-health features for the Random Forest analysis, plus the
# identifier columns and the target ('Mortality Rate').
population_health_cols = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
    'median_age', 'Mortality Rate',
]
pair_mask = df_firstCountryPairing['location'].isin([country1, country2])
df_firstCountryPairing = df_firstCountryPairing.loc[pair_mask, population_health_cols]
In [47]:
# Preview the filtered pairing (rich display via the last expression).
df_firstCountryPairing
Out[47]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2132 rows × 10 columns

In [48]:
# As in the earlier runs: convert the time series to a supervised learning
# problem by adding lagged copies of the target (previous day's, week's,
# and month's mortality rate), computed within each country via
# groupby + shift.  (The original cell held this rationale in a no-op
# triple-quoted string expression; comments/markdown are the right place.)
# Using .assign returns a new frame, avoiding the SettingWithCopyWarning
# that direct column assignment on a filtered frame can raise.
mortality_by_country = df_firstCountryPairing.groupby('location')['Mortality Rate']
df_firstCountryPairing = df_firstCountryPairing.assign(
    prev_day_mortality=mortality_by_country.shift(1),
    prev_week_mortality=mortality_by_country.shift(7),
    prev_month_mortality=mortality_by_country.shift(30),
)
In [49]:
# The first rows of each country's series have no history for the lag
# windows; fill those NaNs with 0 in one vectorized step.
# (Assumes 0 is an appropriate imputation for pre-series mortality.)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_firstCountryPairing[lag_cols] = df_firstCountryPairing[lag_cols].fillna(0)
In [50]:
# Perform Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target (and target-derived features) leak into
# the components later used as model inputs -- confirm this is intended.
# NOTE(review): PCA is also fit on unscaled data and on the full dataset
# (before the train/test split), which leaks test information into the fit.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[50]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [51]:
# Keep the first 7 principal components -- chosen to match the number of
# population-health input variables, not by explained variance.
n_components = 7  # NOTE(review): PCA above saw 11 columns (8 selected features incl. the target + 3 lags); the 7 kept components mix all of them
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [52]:
# Wrap the kept components in a DataFrame, re-attaching 'location'.
# NOTE(review): these columns are *principal components* merely relabeled
# with the original feature names; each is a linear mix of all PCA inputs
# (including mortality-derived ones).  Downstream "feature importances"
# therefore describe components, not the named raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [53]:
# One-hot encode the 'location' column via get_dummies().
# NOTE(review): the resulting location_* dummy columns are never fed to
# the model below (X comes from principal_df); only 'Mortality Rate' is
# read from this frame afterwards -- confirm the encoding is still needed.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [54]:
# Model inputs: the 7 principal-component columns; target: the raw
# mortality rate.  `selected_cols` is reused later to label importances.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_firstCountryPairing['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [55]:
# Fit the standardizer on the training split only (test data must not
# influence the scaling parameters); show the fitted estimator's repr.
scaler = StandardScaler().fit(X_train)
scaler
Out[55]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [56]:
# Apply the train-fitted standardization to the training set.
# NOTE(review): scaling happens on PCA scores *after* PCA, while PCA was
# fit on unscaled data; the conventional pipeline order is scale -> PCA.
X_train_scaled = scaler.transform(X_train)
In [57]:
# Apply the same train-fitted scaler to the test set (no refit, so no
# test-set leakage at this step).
X_test_scaled = scaler.transform(X_test)
In [58]:
# Base estimator; the grid below overrides n_estimators during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space (3 x 3 x 3 x 3 = 81 candidates).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [59]:
# Exhaustive grid search with 10-fold cross-validation: 81 candidates
# x 10 folds = 810 model fits.  n_jobs=-1 parallelizes across all CPU
# cores without changing the (seeded) results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9969839146584258
In [60]:
# Refit a forest with the best hyperparameters found above.  Unpacking
# best_params_ avoids re-typing each key (the original copied all four by
# hand, which silently drifts out of sync if the grid ever changes).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality on the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [61]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) normalizes both arrays into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric, and it becomes infinite wherever y_pred is zero
# while y_test is not — consider MAE instead. TODO confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.041054134347689675
R2 Score: 0.996895600824653
RMSE: 0.202618
Entropy Value: 0.0008300099316689873
In [62]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[62]:
feature importance
1 diabetes_prevalence 0.902614
0 cardiovasc_death_rate 0.058815
2 female_smokers 0.024297
5 aged_65_older 0.006879
6 median_age 0.005122
3 male_smokers 0.001783
4 life_expectancy 0.000490
In [63]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; consider a
# configurable data directory / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[63]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [64]:
country1 = 'Belgium'
country2 = 'Canada'

# Extracting important features for Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# Keep only the rows belonging to the two countries in this pairing.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [65]:
df_updated
Out[65]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 0.000000
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.50 0.929 0.5 44017.591 4.037 1.092509
15717 Canada 12/26/2022 2.50 0.929 0.5 44017.591 4.037 1.092338
15718 Canada 12/27/2022 2.50 0.929 0.5 44017.591 4.037 1.092196
15719 Canada 12/28/2022 2.50 0.929 0.5 44017.591 4.037 1.092321
15720 Canada 12/29/2022 2.50 0.929 0.5 44017.591 4.037 1.093162

2132 rows × 8 columns

In [66]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift inside a single country, so one
# country's last rows never leak into the other country's first rows.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [67]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country fabricates
# "zero mortality" lag values; dropping those rows may be cleaner — TODO confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [68]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[68]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [69]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [70]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the original
# features — labelling PCs with raw feature names makes the later feature-
# importance table easy to misread; PC1..PC5 names would be more honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [71]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [72]:
# These names select columns of principal_df, i.e. PCA component scores that
# were labelled with the original feature names.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split on a daily time series — future days can land in
# the training set; a chronological split would be stricter. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [73]:
# Fit scaling on the training set
# Mean/std come from the training split only; the test split is transformed
# with those same statistics below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[73]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [74]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [75]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [76]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [77]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9974349402891136
In [78]:
# Use the model GridSearchCV already refit on the full training set.
# GridSearchCV defaults to refit=True, so best_estimator_ is a
# RandomForestRegressor (random_state=42, inherited from `rf`) trained with the
# best hyperparameters — re-instantiating it parameter-by-parameter duplicated
# that work and would silently break if the grid keys ever changed.
best_rf_model = grid_search.best_estimator_

# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [79]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) normalizes both arrays into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and can be infinite with zero predictions —
# consider MAE instead. TODO confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.03575065581898076
R2 Score: 0.9972966350842372
RMSE: 0.189078
Entropy Value: 0.0008915270369744506
In [80]:
# Feature importances of the tuned random forest, sorted descending.
# NOTE(review): the model was trained on principal components that were merely
# labelled with the original column names, so each row is the importance of a
# PC — not of the raw feature it is named after.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[80]:
feature importance
1 human_development_index 0.944229
0 hospital_beds_per_thousand 0.030624
2 extreme_poverty 0.022911
3 gdp_per_capita 0.001623
4 population_density 0.000613
In [81]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; consider a
# configurable data directory / relative path.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[81]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [82]:
country1 = 'Bulgaria'
country2 = 'Serbia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_firstCountryPairing = df_firstCountryPairing[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two countries in this pairing.
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])]
In [83]:
df_firstCountryPairing
Out[83]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.717058
16755 Serbia 12/26/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716963
16756 Serbia 12/27/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716677
16757 Serbia 12/28/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716395
16758 Serbia 12/29/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716205

2065 rows × 10 columns

In [84]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift inside a single country, so one
# country's last rows never leak into the other country's first rows.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
In [85]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country fabricates
# "zero mortality" lag values; dropping those rows may be cleaner — TODO confirm.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
In [86]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[86]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [87]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [88]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the original
# features — labelling PCs with raw feature names makes the later feature-
# importance table easy to misread; PC1..PC7 names would be more honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [89]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [90]:
# These names select columns of principal_df, i.e. PCA component scores that
# were labelled with the original feature names.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split on a daily time series — future days can land in
# the training set; a chronological split would be stricter. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [91]:
# Fit scaling on the training set
# Mean/std come from the training split only; the test split is transformed
# with those same statistics below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[91]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [92]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [93]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [94]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [95]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9583831913274017
In [96]:
# Use the model GridSearchCV already refit on the full training set.
# GridSearchCV defaults to refit=True, so best_estimator_ is a
# RandomForestRegressor (random_state=42, inherited from `rf`) trained with the
# best hyperparameters — re-instantiating it parameter-by-parameter duplicated
# that work and would silently break if the grid keys ever changed.
best_rf_model = grid_search.best_estimator_

# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [97]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) normalizes both arrays into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and can be infinite with zero predictions —
# consider MAE instead. TODO confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005568712809238791
R2 Score: 0.99727493711111
RMSE: 0.074624
Entropy Value: 0.000639391380225498
In [98]:
# Feature importances of the tuned random forest, sorted descending.
# NOTE(review): the model was trained on principal components that were merely
# labelled with the original column names, so each row is the importance of a
# PC — not of the raw feature it is named after.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[98]:
feature importance
0 cardiovasc_death_rate 0.809015
5 aged_65_older 0.109960
2 female_smokers 0.032594
3 male_smokers 0.025727
1 diabetes_prevalence 0.016079
4 life_expectancy 0.005342
6 median_age 0.001282
In [99]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; consider a
# configurable data directory / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[99]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [100]:
country1 = 'Bulgaria'
country2 = 'Serbia'

# Extracting important features for Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# Keep only the rows belonging to the two countries in this pairing.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [101]:
df_updated
Out[101]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.50 18563.307 65.180 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.50 18563.307 65.180 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.50 18563.307 65.180 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.50 18563.307 65.180 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.50 18563.307 65.180 14.285714
... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 0.05 14048.881 80.291 0.717058
16755 Serbia 12/26/2022 5.609 0.806 0.05 14048.881 80.291 0.716963
16756 Serbia 12/27/2022 5.609 0.806 0.05 14048.881 80.291 0.716677
16757 Serbia 12/28/2022 5.609 0.806 0.05 14048.881 80.291 0.716395
16758 Serbia 12/29/2022 5.609 0.806 0.05 14048.881 80.291 0.716205

2065 rows × 8 columns

In [102]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift inside a single country, so one
# country's last rows never leak into the other country's first rows.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [103]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country fabricates
# "zero mortality" lag values; dropping those rows may be cleaner — TODO confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [104]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[104]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [105]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [106]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the original
# features — labelling PCs with raw feature names makes the later feature-
# importance table easy to misread; PC1..PC5 names would be more honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [107]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [108]:
# These names select columns of principal_df, i.e. PCA component scores that
# were labelled with the original feature names.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split on a daily time series — future days can land in
# the training set; a chronological split would be stricter. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [109]:
# Fit scaling on the training set
# Mean/std come from the training split only; the test split is transformed
# with those same statistics below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[109]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [110]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [111]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [112]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [113]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9565151571739534
In [114]:
# Use the model GridSearchCV already refit on the full training set.
# GridSearchCV defaults to refit=True, so best_estimator_ is a
# RandomForestRegressor (random_state=42, inherited from `rf`) trained with the
# best hyperparameters — re-instantiating it parameter-by-parameter duplicated
# that work and would silently break if the grid keys ever changed.
best_rf_model = grid_search.best_estimator_

# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [115]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) normalizes both arrays into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and can be infinite with zero predictions —
# consider MAE instead. TODO confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006760468873536101
R2 Score: 0.9966917484399258
RMSE: 0.082222
Entropy Value: 0.000440139151045257
In [116]:
# Feature importances of the tuned random forest, sorted descending.
# NOTE(review): the model was trained on principal components that were merely
# labelled with the original column names, so each row is the importance of a
# PC — not of the raw feature it is named after.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[116]:
feature importance
0 hospital_beds_per_thousand 0.764675
1 human_development_index 0.156947
2 extreme_poverty 0.037875
3 gdp_per_capita 0.026132
4 population_density 0.014372
In [117]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; consider a
# configurable data directory / relative path.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[117]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [118]:
country1 = 'Cyprus'
country2 = 'Luxembourg'

# Extracting important features for Random Forest Model Analysis for the population health index
df_firstCountryPairing = df_firstCountryPairing[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two countries in this pairing.
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])]
In [119]:
df_firstCountryPairing
Out[119]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872

2068 rows × 10 columns

In [120]:
# Random Forests operate on tabular, non-sequential rows, so the time series is
# recast as a supervised-learning problem: past mortality enters each row as
# explicit lag features built with pandas shift(). The lags are computed per
# country so that one country's history never bleeds into another's.
lag_offsets = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_firstCountryPairing.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in lag_offsets.items():
    df_firstCountryPairing[lag_column] = mortality_by_country.shift(lag_days)
In [121]:
# The first day/week/month of each country's series has no lag history; treat
# that missing history as a mortality rate of 0.
for lag_column in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_firstCountryPairing[lag_column] = df_firstCountryPairing[lag_column].fillna(0)
In [122]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which at
# this point includes 'Mortality Rate' and its three lag columns — the
# prediction target leaks into the PCA inputs. PCA is also fit on unscaled
# data (high-variance columns such as cardiovasc_death_rate dominate the
# components) and on the full dataset before the train/test split (test
# information leaks into the components). TODO confirm intent; the fix is to
# fit on the seven predictor columns only, standardized, training rows only.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[122]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [123]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): the transform input has 11 columns (7 predictors + the target
# and its 3 lags); each retained PC is a mixture of all 11, including the target.
n_components = 7  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [124]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of the
# inputs), NOT the original features — reusing the original feature names makes
# the later feature-importance table look interpretable when it is not.
# Honest names would be 'PC1'..'PC7'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [125]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X below comes
# from principal_df); the only effect is removing 'location' from this frame.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [126]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# y comes from a different frame than X; this relies on both frames keeping the
# same row order (true here because principal_df was built positionally).
y = df_firstCountryPairing['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of daily time-series rows puts
# near-identical neighbouring days into both train and test, inflating the
# scores — consider shuffle=False or a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [127]:
# Fit scaling on the training set only; the test set is transformed with these
# statistics below, avoiding scaling leakage. The bare trailing expression
# displays the fitted estimator.
scaler = StandardScaler()
scaler.fit(X_train)
Out[127]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [128]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [129]:
# Apply scaling on the test set (using the train-fitted statistics)
X_test_scaled = scaler.transform(X_test)
In [130]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder — GridSearchCV below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [131]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 81 candidates x 10 folds = 810 model fits — this cell is slow.
# The rows were already shuffled by train_test_split, so adjacent-day
# near-duplicates span CV folds and the ~0.99 CV score is likely optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9919358488759082
In [132]:
# Refit a forest with the hyperparameters the grid search selected, then score
# the held-out test rows. best_params_ holds exactly the four tuned keys, so it
# can be splatted straight into the constructor.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [133]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# probability distributions (it renormalizes its inputs to sum to 1); applying
# it to raw regression targets/predictions — which contain exact zeros — is not
# a meaningful regression error metric. TODO confirm intent or drop it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0010130224723005955
R2 Score: 0.997337568618319
RMSE: 0.031828
Entropy Value: 0.00045829706608428716
In [134]:
# Rank model inputs by impurity-based importance.
# NOTE(review): X held principal components that were mislabelled with the
# original feature names, so these importances describe PCs (mixtures of all
# inputs, including the target's lags), NOT the named health indicators — do
# not read this table as "diabetes_prevalence drives mortality". Also note the
# name feature_importances is reused for both the raw array and the DataFrame.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[134]:
feature importance
1 diabetes_prevalence 0.795629
0 cardiovasc_death_rate 0.146630
2 female_smokers 0.035997
5 aged_65_older 0.010106
3 male_smokers 0.005179
6 median_age 0.003941
4 life_expectancy 0.002516
In [135]:
# Reloading the full cleaned dataset (dataframe-one.csv) for the
# country-health-index analysis of the same country pairing.
# NOTE(review): hardcoded absolute local Windows path — prefer a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[135]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [136]:
# Same pairing as above, now with country-level health-system / economic features.
country1 = 'Cyprus'
country2 = 'Luxembourg'

# Extracting important features for Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [137]:
# Display the filtered two-country frame (rich repr)
df_updated
Out[137]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 32415.132 127.657 0.000000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 32415.132 127.657 0.000000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 32415.132 127.657 0.000000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 32415.132 127.657 0.000000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 32415.132 127.657 0.000000
... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 0.20 94277.965 231.447 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 0.20 94277.965 231.447 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 0.20 94277.965 231.447 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 0.20 94277.965 231.447 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 0.20 94277.965 231.447 0.377872

2068 rows × 8 columns

In [138]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (computed per country so one country's history never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [139]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (rows at the start of each country's series have no lag history)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [140]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the slice includes 'Mortality Rate' and its three lags, so the
# target leaks into the PCA inputs; PCA is also fit on unscaled data and on the
# full dataset before the train/test split. TODO confirm; restrict to the five
# predictor columns, standardized, training rows only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[140]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [141]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): the transform input has 9 columns (5 predictors + target + 3
# lags); each retained PC mixes all of them, including the target.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [142]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the original names makes the later importance table
# misleading. 'PC1'..'PC5' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [143]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [144]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
# Relies on principal_df and df_updated sharing row order (true: built positionally).
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split of daily time-series rows inflates the scores
# (near-identical neighbouring days land in both sets).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [145]:
# Fit scaling on the training set only; the test set is transformed with these
# statistics to avoid scaling leakage.
scaler = StandardScaler()
scaler.fit(X_train)
Out[145]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [146]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [147]:
# Apply scaling on the test set (using the train-fitted statistics)
X_test_scaled = scaler.transform(X_test)
In [148]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — GridSearchCV below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [149]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 81 candidates x 10 folds = 810 model fits — slow cell; CV score
# shares the shuffled-time-series optimism noted at the split.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9884045961442933
In [150]:
# Refit a forest with the grid-search-selected hyperparameters and score the
# hold-out set. best_params_ contains exactly the four tuned keys, so it is
# splatted directly into the constructor.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [151]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between normalized
# probability distributions; using raw regression targets/predictions (which
# include zeros) is not a meaningful error metric. TODO confirm or drop.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0008681898404388339
R2 Score: 0.9977182185591683
RMSE: 0.029465
Entropy Value: 0.00034460747180680264
In [152]:
# Rank model inputs by impurity-based importance.
# NOTE(review): the inputs were principal components mislabelled with original
# feature names, so these importances describe PCs (mixtures of all inputs,
# including the target's lags), NOT the named indicators — do not read this as
# "human_development_index drives mortality".
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[152]:
feature importance
1 human_development_index 0.906843
2 extreme_poverty 0.047073
0 hospital_beds_per_thousand 0.033660
3 gdp_per_capita 0.009333
4 population_density 0.003090
In [153]:
# Reloading the full cleaned dataset (dataframe-one.csv) for the next country
# pairing (the previous cells mutated df_firstCountryPairing in place).
# NOTE(review): hardcoded absolute local Windows path — prefer a configurable
# DATA_DIR (pathlib.Path) for portability.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[153]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [154]:
# Second country pairing (population-health features). This repeats the
# Cyprus/Luxembourg pipeline verbatim — a function parameterized by
# (country1, country2, feature columns) would remove the copy-paste.
country1 = 'Czechia'
country2 = 'Romania'

# Extracting important features for Random Forest Model Analysis for the population health index
df_firstCountryPairing = df_firstCountryPairing[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])]
In [155]:
# Display the filtered two-country frame (rich repr)
df_firstCountryPairing
Out[155]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403

2072 rows × 10 columns

In [156]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (computed per country so one country's history never bleeds into another's)
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
In [157]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (rows at the start of each country's series have no lag history)
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
In [158]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the slice includes 'Mortality Rate' and its three lags — target
# leakage into the PCA inputs; PCA is also unscaled and fit on the full dataset
# before the train/test split. TODO confirm; fit on the seven predictor
# columns only, standardized, training rows only.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[158]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [159]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): the transform input has 11 columns (7 predictors + target + 3
# lags); each retained PC mixes all of them, including the target.
n_components = 7  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [160]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the original names makes the later importance table
# misleading. 'PC1'..'PC7' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [161]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X comes from principal_df).
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [162]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Relies on principal_df and df_firstCountryPairing sharing row order (true: built positionally).
y = df_firstCountryPairing['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split of daily time-series rows inflates the scores
# (near-identical neighbouring days land in both sets).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [163]:
# Fit scaling on the training set only; the test set is transformed with these
# statistics to avoid scaling leakage.
scaler = StandardScaler()
scaler.fit(X_train)
Out[163]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [164]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [165]:
# Apply scaling on the test set (using the train-fitted statistics)
X_test_scaled = scaler.transform(X_test)
In [166]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — GridSearchCV below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [167]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 81 candidates x 10 folds = 810 model fits — slow cell; CV score
# shares the shuffled-time-series optimism noted at the split.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9954484396430019
In [168]:
# Refit a forest with the grid-search-selected hyperparameters and score the
# hold-out set. best_params_ contains exactly the four tuned keys, so it is
# splatted directly into the constructor.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [169]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between normalized
# probability distributions; using raw regression targets/predictions (which
# include zeros) is not a meaningful error metric. TODO confirm or drop.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00747804703817538
R2 Score: 0.9948675599333417
RMSE: 0.086476
Entropy Value: 0.0006442605230367055
In [170]:
# Rank model inputs by impurity-based importance.
# NOTE(review): the inputs were principal components mislabelled with original
# feature names, so these importances describe PCs (mixtures of all inputs,
# including the target's lags), NOT the named health indicators.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[170]:
feature importance
0 cardiovasc_death_rate 0.835149
1 diabetes_prevalence 0.111452
2 female_smokers 0.020746
6 median_age 0.020522
5 aged_65_older 0.008818
3 male_smokers 0.002733
4 life_expectancy 0.000580
In [171]:
# Reloading the full cleaned dataset (dataframe-one.csv) for the
# country-health-index analysis of the Czechia/Romania pairing.
# NOTE(review): hardcoded absolute local Windows path — prefer a configurable
# DATA_DIR (pathlib.Path) for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[171]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [172]:
# Czechia/Romania pairing with country-level health-system / economic features
# (same pipeline repeated — see the copy-paste note on the first pairing).
country1 = 'Czechia'
country2 = 'Romania'

# Extracting important features for Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [173]:
# Display the filtered two-country frame (rich repr)
df_updated
Out[173]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
4153 Czechia 3/1/2020 6.630 0.900 0.0 32605.906 137.176 0.000000
4154 Czechia 3/2/2020 6.630 0.900 0.0 32605.906 137.176 0.000000
4155 Czechia 3/3/2020 6.630 0.900 0.0 32605.906 137.176 0.000000
4156 Czechia 3/4/2020 6.630 0.900 0.0 32605.906 137.176 0.000000
4157 Czechia 3/5/2020 6.630 0.900 0.0 32605.906 137.176 0.000000
... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.7 23313.199 85.129 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.7 23313.199 85.129 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.7 23313.199 85.129 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.7 23313.199 85.129 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.7 23313.199 85.129 2.036403

2072 rows × 8 columns

In [174]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (computed per country so one country's history never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [175]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (rows at the start of each country's series have no lag history)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [176]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the slice includes 'Mortality Rate' and its three lags — target
# leakage into the PCA inputs; PCA is also unscaled and fit on the full dataset
# before the train/test split. TODO confirm; fit on the five predictor columns
# only, standardized, training rows only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[176]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [177]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): the transform input has 9 columns (5 predictors + target + 3
# lags); each retained PC mixes all of them, including the target.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [178]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the original names makes the later importance table
# misleading. 'PC1'..'PC5' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [179]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [180]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
# Relies on principal_df and df_updated sharing row order (true: built positionally).
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split of daily time-series rows inflates the scores
# (near-identical neighbouring days land in both sets).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [181]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[181]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [182]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [183]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [184]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [185]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9968976083567679
In [186]:
# Refit a Random Forest on the full training set using the tuned hyperparameters;
# best_params_ holds exactly the four grid keys searched above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Hold-out predictions used for evaluation below
y_pred = best_rf_model.predict(X_test_scaled)
In [187]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between two
# probability distributions; applying it to raw mortality values is questionable
# as a regression metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006146149886833695
R2 Score: 0.9957816866123151
RMSE: 0.078397
Entropy Value: 0.0005517608195709189
In [188]:
# Rank inputs by their importance in the fitted forest. NOTE: the "features" here
# are principal components relabeled with original column names.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[188]:
feature importance
1 human_development_index 0.739105
0 hospital_beds_per_thousand 0.221891
2 extreme_poverty 0.032045
3 gdp_per_capita 0.005941
4 population_density 0.001018
In [189]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a configurable DATA_DIR.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[189]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [190]:
country1 = 'Denmark'
country2 = 'Ireland'

# Extracting important features for Random Forest Model Analysis for the population health index
df_firstCountryPairing = df_firstCountryPairing[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries under comparison; .copy() materializes an independent
# frame so later cells can assign new (lagged) columns without pandas'
# SettingWithCopyWarning firing on a view of the original dataframe.
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])].copy()
In [191]:
df_firstCountryPairing
Out[191]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 18.8 80.9 19.677 42.3 0.000000
5188 Denmark 2/3/2020 114.767 6.41 19.3 18.8 80.9 19.677 42.3 0.000000
5189 Denmark 2/4/2020 114.767 6.41 19.3 18.8 80.9 19.677 42.3 0.000000
5190 Denmark 2/5/2020 114.767 6.41 19.3 18.8 80.9 19.677 42.3 0.000000
5191 Denmark 2/6/2020 114.767 6.41 19.3 18.8 80.9 19.677 42.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 25.7 82.3 13.928 38.7 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 25.7 82.3 13.928 38.7 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 25.7 82.3 13.928 38.7 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 25.7 82.3 13.928 38.7 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 25.7 82.3 13.928 38.7 0.491388

2097 rows × 10 columns

In [192]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
In [193]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
In [194]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[194]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [195]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [196]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component (a mixture of ALL numeric inputs)
# relabeled with an original feature name — the labels are misleading, and downstream
# feature importances therefore describe PCs, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [197]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from principal_df).
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [198]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Feature matrix from the (relabeled) principal components; target from the original frame.
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [199]:
# Fit scaling on the training set
# NOTE(review): scaling is applied AFTER PCA here; conventionally features are
# standardized before PCA — confirm this ordering is intended.
scaler = StandardScaler()
scaler.fit(X_train)
Out[199]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [200]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [201]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [202]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [203]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9979884469227974
In [204]:
# Refit a Random Forest on the full training set using the tuned hyperparameters;
# best_params_ holds exactly the four grid keys searched above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Hold-out predictions used for evaluation below
y_pred = best_rf_model.predict(X_test_scaled)
In [205]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between two
# probability distributions; applying it to raw mortality values is questionable
# as a regression metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005995659189568175
R2 Score: 0.9977638141467456
RMSE: 0.077432
Entropy Value: 0.0012403023391896217
In [206]:
# Rank inputs by their importance in the fitted forest. NOTE: the "features" here
# are principal components relabeled with original column names.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[206]:
feature importance
1 diabetes_prevalence 0.826356
0 cardiovasc_death_rate 0.143178
2 female_smokers 0.015810
6 median_age 0.007248
3 male_smokers 0.004191
5 aged_65_older 0.002957
4 life_expectancy 0.000260
In [207]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[207]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [208]:
country1 = 'Denmark'
country2 = 'Ireland'

# Extracting important features for Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# Keep only the two countries under comparison; .copy() materializes an independent
# frame so later cells can assign new (lagged) columns without pandas'
# SettingWithCopyWarning firing on a view of the original dataframe.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [209]:
df_updated
Out[209]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
5187 Denmark 2/2/2020 2.50 0.940 0.2 46682.515 136.520 0.000000
5188 Denmark 2/3/2020 2.50 0.940 0.2 46682.515 136.520 0.000000
5189 Denmark 2/4/2020 2.50 0.940 0.2 46682.515 136.520 0.000000
5190 Denmark 2/5/2020 2.50 0.940 0.2 46682.515 136.520 0.000000
5191 Denmark 2/6/2020 2.50 0.940 0.2 46682.515 136.520 0.000000
... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.955 0.2 67335.293 69.874 0.491388
19869 Ireland 12/26/2022 2.96 0.955 0.2 67335.293 69.874 0.491388
19870 Ireland 12/27/2022 2.96 0.955 0.2 67335.293 69.874 0.491388
19871 Ireland 12/28/2022 2.96 0.955 0.2 67335.293 69.874 0.491388
19872 Ireland 12/29/2022 2.96 0.955 0.2 67335.293 69.874 0.491388

2097 rows × 8 columns

In [210]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [211]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [212]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[212]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [213]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [214]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component (a mixture of ALL numeric inputs)
# relabeled with an original feature name — the labels are misleading, and downstream
# feature importances therefore describe PCs, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [215]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [216]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
# Feature matrix from the (relabeled) principal components; target from the original frame.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [217]:
# Fit scaling on the training set
# NOTE(review): scaling is applied AFTER PCA here; conventionally features are
# standardized before PCA — confirm this ordering is intended.
scaler = StandardScaler()
scaler.fit(X_train)
Out[217]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [218]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [219]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [220]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [221]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9981483428520672
In [222]:
# Refit a Random Forest on the full training set using the tuned hyperparameters;
# best_params_ holds exactly the four grid keys searched above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Hold-out predictions used for evaluation below
y_pred = best_rf_model.predict(X_test_scaled)
In [223]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between two
# probability distributions; applying it to raw mortality values is questionable
# as a regression metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007388200063707959
R2 Score: 0.9972444416967158
RMSE: 0.085955
Entropy Value: 0.00131007217788142
In [224]:
# Rank inputs by their importance in the fitted forest. NOTE: the "features" here
# are principal components relabeled with original column names.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[224]:
feature importance
1 human_development_index 0.956110
0 hospital_beds_per_thousand 0.023453
2 extreme_poverty 0.016011
3 gdp_per_capita 0.003906
4 population_density 0.000520
In [225]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a configurable DATA_DIR.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[225]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [226]:
country1 = 'Estonia'
country2 = 'Latvia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_firstCountryPairing = df_firstCountryPairing[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries under comparison; .copy() materializes an independent
# frame so later cells can assign new (lagged) columns without pandas'
# SettingWithCopyWarning firing on a view of the original dataframe.
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])].copy()
In [227]:
df_firstCountryPairing
Out[227]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2099 rows × 10 columns

In [228]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
In [229]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
In [230]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[230]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [231]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [232]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component (a mixture of ALL numeric inputs)
# relabeled with an original feature name — the labels are misleading, and downstream
# feature importances therefore describe PCs, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [233]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from principal_df).
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [234]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Feature matrix from the (relabeled) principal components; target from the original frame.
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [235]:
# Fit scaling on the training set
# NOTE(review): scaling is applied AFTER PCA here; conventionally features are
# standardized before PCA — confirm this ordering is intended.
scaler = StandardScaler()
scaler.fit(X_train)
Out[235]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [236]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [237]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [238]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [239]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9979817144366736
In [240]:
# Refit a Random Forest on the full training set using the tuned hyperparameters;
# best_params_ holds exactly the four grid keys searched above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Hold-out predictions used for evaluation below
y_pred = best_rf_model.predict(X_test_scaled)
In [241]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between two
# probability distributions; applying it to raw mortality values is questionable
# as a regression metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0018696016902774828
R2 Score: 0.9968998375395624
RMSE: 0.043239
Entropy Value: 0.0007273351883650379
In [242]:
# Rank inputs by their importance in the fitted forest. NOTE: the "features" here
# are principal components relabeled with original column names.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[242]:
feature importance
1 diabetes_prevalence 0.954112
2 female_smokers 0.017664
0 cardiovasc_death_rate 0.012118
5 aged_65_older 0.010583
6 median_age 0.004349
3 male_smokers 0.000941
4 life_expectancy 0.000232
In [243]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Reload the per-country dataframe ("dataframe-one") for the next country pair.
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine;
# prefer a DATA_DIR Path constant defined once in a config cell.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[243]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [244]:
# Country pair under comparison for this round of the analysis.
country1 = 'Estonia'
country2 = 'Latvia'

# Keep identifiers, the country-health-index features, and the target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# .copy() materializes the filtered rows as an independent frame so the lagged
# columns assigned in later cells write to it directly instead of to a view of
# the original (avoids pandas SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [245]:
# Inspect the filtered two-country frame (bare expression -> rich display).
df_updated
Out[245]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.5 29481.252 31.033 0.000000
6250 Estonia 1/18/2020 4.69 0.892 0.5 29481.252 31.033 0.000000
6251 Estonia 2/5/2020 4.69 0.892 0.5 29481.252 31.033 0.000000
6252 Estonia 2/6/2020 4.69 0.892 0.5 29481.252 31.033 0.000000
6253 Estonia 2/7/2020 4.69 0.892 0.5 29481.252 31.033 0.000000
... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.7 25063.846 31.212 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.7 25063.846 31.212 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.7 25063.846 31.212 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.7 25063.846 31.212 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.7 25063.846 31.212 0.631969

2099 rows × 8 columns

In [246]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality series by 1 day / 7 days / 30 days per country.
# groupby('location') prevents a shift from bleeding across the country
# boundary; the first 1/7/30 rows of each country become NaN (zero-filled in
# the next cell).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [247]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [248]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[248]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [249]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # number of input variables for Random Forest Model Analysis
# Project every row onto the component axes and keep the first 5 scores.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [250]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [251]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [252]:
# Model matrix: the 5 PCA component scores (column names reuse the raw
# feature names, see note where principal_df is built) and the target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 shuffle of a daily time series with lagged
# features puts near-duplicate adjacent days in both train and test, which
# inflates scores; a chronological split would be a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [253]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[253]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [254]:
# Apply scaling on the training set
# Standardize the training features with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
In [255]:
# Apply scaling on the test set
# Standardize the test features with the SAME train-fitted scaler.
X_test_scaled = scaler.transform(X_test)
In [256]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [257]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975189433093143
In [258]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [259]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002613839817756454
R2 Score: 0.9956657462802128
RMSE: 0.051126
Entropy Value: 0.0010067130210343615
In [260]:
# Rank the model's inputs by impurity-based importance.
# The forest was fit on PCA component scores (principal_df), not the raw
# variables, so each importance belongs to a principal component; the raw
# feature name that was reused as that component's column label is kept
# alongside for traceability, but must not be read as that variable's
# importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': ['PC%d (column label: %s)' % (i + 1, col) for i, col in enumerate(selected_cols)],
    'importance': feature_importances,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[260]:
feature importance
1 human_development_index 0.960216
2 extreme_poverty 0.023054
0 hospital_beds_per_thousand 0.014476
3 gdp_per_capita 0.001788
4 population_density 0.000466
In [261]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Reload the per-country dataframe ("dataframe-one") for the next country pair.
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[261]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [262]:
# Country pair under comparison for this round of the analysis.
country1 = 'Finland'
country2 = 'Iceland'

# Keep identifiers, the population-health-index features, and the target.
df_firstCountryPairing = df_firstCountryPairing[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() materializes the filtered rows so the lagged columns assigned in
# later cells write to an independent frame, not a view of the original
# (avoids pandas SettingWithCopyWarning / silently lost writes).
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])].copy()
In [263]:
# Inspect the filtered two-country frame (bare expression -> rich display).
df_firstCountryPairing
Out[263]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7311 Finland 1/30/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7312 Finland 1/31/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7313 Finland 2/1/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7314 Finland 2/2/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
... ... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2102 rows × 10 columns

In [264]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality series by 1 day / 7 days / 30 days per country;
# groupby('location') keeps each shift inside its own country's series.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
In [265]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
In [266]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[266]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [267]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [268]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [269]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [270]:
# Model matrix: the 7 PCA component scores (column names reuse the raw
# feature names) and the target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffle split of a daily time series with lagged
# features inflates scores; a chronological split would be fairer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [271]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[271]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [272]:
# Apply scaling on the training set
# Standardize the training features with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
In [273]:
# Apply scaling on the test set
# Standardize the test features with the SAME train-fitted scaler.
X_test_scaled = scaler.transform(X_test)
In [274]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [275]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9955159123177418
In [276]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [277]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003560841308279542
R2 Score: 0.9970021670932279
RMSE: 0.059673
Entropy Value: 0.0011802071599462333
In [278]:
# Rank the model's inputs by impurity-based importance.
# NOTE(review): the forest was fit on PCA component scores, so these rows
# describe principal components, not the raw variables whose names label them.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[278]:
feature importance
1 diabetes_prevalence 0.517316
0 cardiovasc_death_rate 0.432531
6 median_age 0.019175
5 aged_65_older 0.014394
2 female_smokers 0.011943
3 male_smokers 0.003277
4 life_expectancy 0.001363
In [279]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Reload the per-country dataframe ("dataframe-one") for the next country pair.
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[279]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [280]:
# Country pair under comparison for this round of the analysis.
country1 = 'Finland'
country2 = 'Iceland'

# Keep identifiers, the country-health-index features, and the target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# .copy() materializes the filtered rows so the lagged columns assigned in
# later cells write to an independent frame, not a view of the original
# (avoids pandas SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [281]:
# Inspect the filtered two-country frame (bare expression -> rich display).
df_updated
Out[281]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
7310 Finland 1/29/2020 3.28 0.938 0.04 40585.721 18.136 0.00000
7311 Finland 1/30/2020 3.28 0.938 0.04 40585.721 18.136 0.00000
7312 Finland 1/31/2020 3.28 0.938 0.04 40585.721 18.136 0.00000
7313 Finland 2/1/2020 3.28 0.938 0.04 40585.721 18.136 0.00000
7314 Finland 2/2/2020 3.28 0.938 0.04 40585.721 18.136 0.00000
... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 0.11011

2102 rows × 8 columns

In [282]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality series by 1 day / 7 days / 30 days per country;
# groupby('location') keeps each shift inside its own country's series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [283]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [284]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[284]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [285]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [286]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [287]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [288]:
# Model matrix: the 5 PCA component scores (column names reuse the raw
# feature names) and the target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffle split of a daily time series with lagged
# features inflates scores; a chronological split would be fairer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [289]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[289]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [290]:
# Apply scaling on the training set
# Standardize the training features with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
In [291]:
# Apply scaling on the test set
# Standardize the test features with the SAME train-fitted scaler.
X_test_scaled = scaler.transform(X_test)
In [292]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [293]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9953918346594799
In [294]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [295]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0030042606321511634
R2 Score: 0.9974707462074646
RMSE: 0.054811
Entropy Value: 0.0010085785334707671
In [296]:
# Rank the model's inputs by impurity-based importance.
# NOTE(review): the forest was fit on PCA component scores, so these rows
# describe principal components, not the raw variables whose names label them.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[296]:
feature importance
1 human_development_index 0.941978
0 hospital_beds_per_thousand 0.035166
2 extreme_poverty 0.014051
3 gdp_per_capita 0.006777
4 population_density 0.002029
In [297]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Reload the per-country dataframe ("dataframe-one") for the next country pair.
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[297]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [298]:
# Country pair under comparison for this round of the analysis.
country1 = 'France'
country2 = 'Italy'

# Keep identifiers, the population-health-index features, and the target.
df_firstCountryPairing = df_firstCountryPairing[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() materializes the filtered rows so the lagged columns assigned in
# later cells write to an independent frame, not a view of the original
# (avoids pandas SettingWithCopyWarning / silently lost writes).
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])].copy()
In [299]:
# Inspect the filtered two-country frame (bare expression -> rich display).
df_firstCountryPairing
Out[299]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
8376 France 1/24/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8377 France 1/25/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8378 France 1/26/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8379 France 1/27/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8380 France 1/28/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2135 rows × 10 columns

In [300]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality series by 1 day / 7 days / 30 days per country;
# groupby('location') keeps each shift inside its own country's series.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
In [301]:
# The first 1 / 7 / 30 rows of each country have no lag history (shift produced NaN);
# treat that missing history as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_firstCountryPairing[lag_cols] = df_firstCountryPairing[lag_cols].fillna(0)
In [302]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the raw (unscaled) columns; the components will be
# dominated by the highest-variance features. Standardization only happens later, after
# the train/test split — consider scaling before fitting PCA. TODO confirm intent.
# NOTE(review): iloc[:, 2:] also includes the three lagged-mortality columns created
# above, i.e. target-derived features enter the PCA input — verify this is intended.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[302]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [303]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for Random Forest Model Analysis
# Project the (unscaled) feature matrix onto the components fitted in the previous
# cell, keeping only the first 7 component scores.
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [304]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the original
# variables — labelling them with the original feature names is misleading (PC1 is a
# mixture of all inputs, not 'cardiovasc_death_rate'). The feature importances reported
# later inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [305]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never read below — only
# 'Mortality Rate' is taken from df_firstCountryPairing after this point. TODO confirm
# the one-hot encoding is still needed.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [306]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the PCA scores (row-aligned with df_firstCountryPairing); y is the raw target.
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random (shuffled) split on time-series data interleaves train and test
# observations in time; consider a chronological split if out-of-sample forecasting
# performance is the goal.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [307]:
# Fit scaling on the training set
# StandardScaler statistics (mean/std) are estimated on the training split only, so the
# test split below is transformed with training statistics — no leakage at this step.
scaler = StandardScaler()
scaler.fit(X_train)
Out[307]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [308]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [309]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [310]:
# Base RandomForestRegressor; the seed is fixed so the grid-search results repeat.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search (3 * 3 * 3 * 3 = 81 candidates).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [311]:
# Exhaustive grid search with 10-fold cross-validation (k = 10) over param_grid.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean cross-validation score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print('Best hyperparameters:', best_params)
print('Best CV score:', best_score)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9950840810056313
In [312]:
# Refit a fresh forest with the best hyperparameters found by the grid search.
# Improvement: unpack grid_search.best_params_ directly instead of re-spelling each key —
# less repetition, and the cell cannot drift out of sync if param_grid gains a key.
# (Equivalently, grid_search.best_estimator_ is already refit on the full training set
# because GridSearchCV defaults to refit=True.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test features.
y_pred = best_rf_model.predict(X_test_scaled)
In [313]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between pk and qk
# after normalizing each to a probability distribution — it is not a standard regression
# metric, and zeros in y_test make the divergence ill-defined. Interpret with caution or
# drop it. TODO confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.07475212300690831
R2 Score: 0.995997908929556
RMSE: 0.273408
Entropy Value: 0.0007613902551123477
In [314]:
# Impurity-based importances of the tuned forest, sorted descending.
# NOTE(review): because X contains PCA scores, these importances belong to principal
# components, not to the original variables whose names label the rows.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[314]:
feature importance
1 diabetes_prevalence 0.953673
2 female_smokers 0.024054
0 cardiovasc_death_rate 0.010520
5 aged_65_older 0.004307
3 male_smokers 0.004017
6 median_age 0.002808
4 life_expectancy 0.000621
In [315]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[315]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [316]:
country1 = 'France'
country2 = 'Italy'

# Restrict the rows to the selected country pairing first, then keep only the
# country-health-index features needed for the Random Forest analysis. (Filtering rows
# before selecting columns yields the same frame as the reverse order.)
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
In [317]:
df_updated
Out[317]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 122.578 0.000000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 122.578 0.000000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 122.578 0.000000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 122.578 0.000000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 122.578 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.00 35220.084 205.859 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.00 35220.084 205.859 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.00 35220.084 205.859 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.00 35220.084 205.859 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.00 35220.084 205.859 0.735109

2135 rows × 8 columns

In [318]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [319]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [320]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on raw (unscaled) columns, so high-variance features such
# as gdp_per_capita dominate the components; iloc[:, 2:] also includes the lagged
# mortality (target-derived) columns. Verify both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[320]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [321]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Project the (unscaled) features onto the fitted components; keep the first 5 scores.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [322]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the original
# variables — the original feature names are misleading labels here, and the feature
# importances reported later inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [323]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [324]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [325]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[325]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [326]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [327]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [328]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [329]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best CV score: 0.9949728012103314
In [330]:
# Refit a fresh forest with the best hyperparameters found by the grid search.
# Improvement: unpack grid_search.best_params_ directly instead of re-spelling each key —
# less repetition, and the cell cannot drift out of sync if param_grid gains a key.
# (Equivalently, grid_search.best_estimator_ is already refit on the full training set
# because GridSearchCV defaults to refit=True.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test features.
y_pred = best_rf_model.predict(X_test_scaled)
In [331]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between normalized
# distributions, not a regression metric; zeros in y_test make it ill-defined.
# TODO confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0891388770292683
R2 Score: 0.9952276686007263
RMSE: 0.298561
Entropy Value: 0.0009498782679573481
In [332]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[332]:
feature importance
1 human_development_index 0.966568
2 extreme_poverty 0.022584
0 hospital_beds_per_thousand 0.006094
3 gdp_per_capita 0.004111
4 population_density 0.000643
In [333]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[333]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [334]:
country1 = 'Netherlands'
country2 = 'Sweden'

# Keep only the population-health-index features used by the Random Forest analysis,
# then restrict the rows to the selected country pairing.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_firstCountryPairing = df_firstCountryPairing[health_cols]
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])]
In [335]:
df_firstCountryPairing
Out[335]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.816005

2100 rows × 10 columns

In [336]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
In [337]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
In [338]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fitted on raw (unscaled) columns, and iloc[:, 2:] includes the lagged
# mortality (target-derived) columns — verify both are intended.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[338]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [339]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [340]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [341]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [342]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [343]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[343]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [344]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [345]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [346]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [347]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984656768745841
In [348]:
# Refit a fresh forest with the best hyperparameters found by the grid search.
# Improvement: unpack grid_search.best_params_ directly instead of re-spelling each key —
# less repetition, and the cell cannot drift out of sync if param_grid gains a key.
# (Equivalently, grid_search.best_estimator_ is already refit on the full training set
# because GridSearchCV defaults to refit=True.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test features.
y_pred = best_rf_model.predict(X_test_scaled)
In [349]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between normalized
# distributions, not a regression metric; zeros in y_test make it ill-defined.
# TODO confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010402460269562753
R2 Score: 0.9990051222850926
RMSE: 0.101992
Entropy Value: 0.00040629309095891516
In [350]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[350]:
feature importance
1 diabetes_prevalence 0.973529
2 female_smokers 0.023056
3 male_smokers 0.001375
0 cardiovasc_death_rate 0.000910
5 aged_65_older 0.000438
4 life_expectancy 0.000347
6 median_age 0.000345
In [351]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[351]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [352]:
country1 = 'Netherlands'
country2 = 'Sweden'

# Keep only the country-health-index features used by the Random Forest analysis,
# then restrict the rows to the selected country pairing.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated[index_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [353]:
df_updated
Out[353]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 0.000000
... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 46949.283 24.718 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 46949.283 24.718 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 46949.283 24.718 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 46949.283 24.718 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 46949.283 24.718 0.816005

2100 rows × 8 columns

In [354]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [355]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [356]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fitted on raw (unscaled) columns, and iloc[:, 2:] includes the lagged
# mortality (target-derived) columns — verify both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[356]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [357]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [358]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [359]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [360]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [361]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[361]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [362]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [363]:
# Apply the train-fitted scaler to the test set (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [364]:
# Instantiate the RandomForestRegressor Model
# (the constructor's n_estimators=100 is superseded by the grid below;
# random_state=42 keeps every candidate fit reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [365]:
# perform grid search and 10-fold cross-validation (k = 10) over all 81
# hyper-parameter combinations; n_jobs=-1 runs candidate fits in parallel
# across CPU cores and does not change the results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984950385348217
In [366]:
# fit random forest model with best hyperparameters from above;
# unpacking best_params_ keeps this cell in sync with the grid definition
# instead of hand-copying each key.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [367]:
# Evaluate the Random Forest on the held-out test set: MSE, RMSE, R^2, "entropy".
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalizes both vectors
# into probability distributions and returns their KL divergence — it is not a
# standard regression metric and is infinite when y_pred has a zero where
# y_test does not. Confirm this is the intended measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011447944113759524
R2 Score: 0.9989051335756014
RMSE: 0.106995
Entropy Value: 0.0004521099453512048
In [368]:
# Rank model inputs by impurity-based importance (highest first).
# NOTE(review): these "features" are principal components that reuse the raw
# column names, so the ranking describes PCs, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[368]:
feature importance
1 human_development_index 0.974035
2 extreme_poverty 0.023848
3 gdp_per_capita 0.001661
4 population_density 0.000405
0 hospital_beds_per_thousand 0.000052
In [369]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — a configurable DATA_DIR or
# relative path would make the notebook portable.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[369]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [370]:
# Country pair under comparison in this section
country1 = 'Portugal'
country2 = 'Spain'

# Extracting important features for Random Forest Model Analysis for the population health index
df_firstCountryPairing = df_firstCountryPairing[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])]
In [371]:
# Preview the filtered country-pair frame (bare expression → rich display)
df_firstCountryPairing
Out[371]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148

2097 rows × 10 columns

In [372]:
# (The triple-quoted block that sat here was a no-op expression statement;
# converted to real comments so it is not mistaken for a docstring.)
#
# Add lagged mortality features (previous day / week / month) so the time
# series becomes a supervised-learning table: a Random Forest is a
# non-sequential learner, so each row must carry its own history as features.
# groupby('location') stops a shift from bleeding across country boundaries.
# NOTE(review): shift(k) assumes one row per day per country with no date
# gaps — confirm against the source data.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_firstCountryPairing[lag_col] = (
        df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(lag)
    )
In [373]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the leading rows of each country genuinely have no history;
# imputing 0 fabricates a "zero mortality" signal there — dropping those rows
# may be preferable.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
In [374]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# here includes the target 'Mortality Rate' and its three lag columns — the
# target leaks into the PCA features. The inputs are also unscaled, so
# large-magnitude columns dominate the components. Both points need review.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[374]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [375]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): matching the count of raw inputs is an arbitrary cut-off;
# pca.explained_variance_ratio_ would give a principled choice.
n_components = 7  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [376]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column is a principal component (a linear mix of ALL
# inputs), not the original variable — reusing raw feature names here is
# misleading; 'PC1'..'PC7' would be clearer (downstream selected_cols would
# need the same rename).
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [377]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df), so this effectively just removes 'location' from the frame.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [378]:
# Inputs: the seven principal components; target: the mortality rate.
# Rows of principal_df and df_firstCountryPairing align by position.
# NOTE(review): train_test_split shuffles a time series, so training rows can
# postdate test rows — a chronological split may be more appropriate here.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [379]:
# Fit scaling on the training set only — mean/std are learned from X_train so
# no test-set statistics leak into the transform.
scaler = StandardScaler()
scaler.fit(X_train)
Out[379]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [380]:
# Apply the train-fitted scaler to the training set
X_train_scaled = scaler.transform(X_train)
In [381]:
# Apply the train-fitted scaler to the test set (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [382]:
# Instantiate the RandomForestRegressor Model
# (the constructor's n_estimators=100 is superseded by the grid below;
# random_state=42 keeps every candidate fit reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [383]:
# perform grid search and 10-fold cross-validation (k = 10) over all 81
# hyper-parameter combinations; n_jobs=-1 runs candidate fits in parallel
# across CPU cores and does not change the results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998521297102364
In [384]:
# fit random forest model with best hyperparameters from above;
# unpacking best_params_ keeps this cell in sync with the grid definition
# instead of hand-copying each key.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [385]:
# Evaluate the Random Forest on the held-out test set: MSE, RMSE, R^2, "entropy".
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalizes both vectors
# into probability distributions and returns their KL divergence — it is not a
# standard regression metric and is infinite when y_pred has a zero where
# y_test does not. Confirm this is the intended measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013814747661354383
R2 Score: 0.9974717098839665
RMSE: 0.117536
Entropy Value: 0.0004896303455702192
In [386]:
# Rank model inputs by impurity-based importance (highest first).
# NOTE(review): these "features" are principal components that reuse the raw
# column names, so the ranking describes PCs, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[386]:
feature importance
1 diabetes_prevalence 0.569929
0 cardiovasc_death_rate 0.229154
5 aged_65_older 0.161755
2 female_smokers 0.033937
6 median_age 0.002516
3 male_smokers 0.002434
4 life_expectancy 0.000275
In [387]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — a configurable DATA_DIR or
# relative path would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[387]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [388]:
# Country pair under comparison in this section
country1 = 'Portugal'
country2 = 'Spain'

# Extracting important features for Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [389]:
# Preview the filtered country-pair frame (bare expression → rich display)
df_updated
Out[389]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 0.000000
... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.0 34272.360 93.105 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.0 34272.360 93.105 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.0 34272.360 93.105 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.0 34272.360 93.105 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.0 34272.360 93.105 0.855148

2097 rows × 8 columns

In [390]:
# (The triple-quoted block that sat here was a no-op expression statement;
# converted to real comments so it is not mistaken for a docstring.)
#
# Add lagged mortality features (previous day / week / month) so the time
# series becomes a supervised-learning table: a Random Forest is a
# non-sequential learner, so each row must carry its own history as features.
# groupby('location') stops a shift from bleeding across country boundaries.
# NOTE(review): shift(k) assumes one row per day per country with no date
# gaps — confirm against the source data.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
    )
In [391]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the leading rows of each country genuinely have no history;
# imputing 0 fabricates a "zero mortality" signal there — dropping those rows
# may be preferable.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [392]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# here includes the target 'Mortality Rate' and its three lag columns — the
# target leaks into the PCA features. The inputs are also unscaled, so
# large-magnitude columns dominate the components. Both points need review.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[392]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [393]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): matching the count of raw inputs is an arbitrary cut-off;
# pca.explained_variance_ratio_ would give a principled choice.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [394]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column is a principal component (a linear mix of ALL
# inputs), not the original variable — reusing raw feature names here is
# misleading; 'PC1'..'PC5' would be clearer (downstream selected_cols would
# need the same rename).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [395]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df), so this effectively just removes 'location' from the frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [396]:
# Inputs: the five principal components; target: the mortality rate.
# Rows of principal_df and df_updated align by position.
# NOTE(review): train_test_split shuffles a time series, so training rows can
# postdate test rows — a chronological split may be more appropriate here.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [397]:
# Fit scaling on the training set only — mean/std are learned from X_train so
# no test-set statistics leak into the transform.
scaler = StandardScaler()
scaler.fit(X_train)
Out[397]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [398]:
# Apply the train-fitted scaler to the training set
X_train_scaled = scaler.transform(X_train)
In [399]:
# Apply the train-fitted scaler to the test set (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [400]:
# Instantiate the RandomForestRegressor Model
# (the constructor's n_estimators=100 is superseded by the grid below;
# random_state=42 keeps every candidate fit reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [401]:
# perform grid search and 10-fold cross-validation (k = 10) over all 81
# hyper-parameter combinations; n_jobs=-1 runs candidate fits in parallel
# across CPU cores and does not change the results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9970653272478941
In [402]:
# fit random forest model with best hyperparameters from above;
# unpacking best_params_ keeps this cell in sync with the grid definition
# instead of hand-copying each key.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [403]:
# Evaluate the Random Forest on the held-out test set: MSE, RMSE, R^2, "entropy".
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalizes both vectors
# into probability distributions and returns their KL divergence — it is not a
# standard regression metric and is infinite when y_pred has a zero where
# y_test does not. Confirm this is the intended measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.028816362882560887
R2 Score: 0.9947262065698224
RMSE: 0.169754
Entropy Value: 0.0011094469817051415
In [404]:
# Rank model inputs by impurity-based importance (highest first).
# NOTE(review): these "features" are principal components that reuse the raw
# column names, so the ranking describes PCs, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[404]:
feature importance
1 human_development_index 0.941568
2 extreme_poverty 0.040873
0 hospital_beds_per_thousand 0.012181
3 gdp_per_capita 0.004690
4 population_density 0.000688
In [405]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — a configurable DATA_DIR or
# relative path would make the notebook portable.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[405]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [406]:
# Country pair under comparison in this section
country1 = 'Slovakia'
country2 = 'Slovenia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_firstCountryPairing = df_firstCountryPairing[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])]
In [407]:
# Preview the filtered country-pair frame (bare expression → rich display)
df_firstCountryPairing
Out[407]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2091 rows × 10 columns

In [408]:
# (The triple-quoted block that sat here was a no-op expression statement;
# converted to real comments so it is not mistaken for a docstring.)
#
# Add lagged mortality features (previous day / week / month) so the time
# series becomes a supervised-learning table: a Random Forest is a
# non-sequential learner, so each row must carry its own history as features.
# groupby('location') stops a shift from bleeding across country boundaries.
# NOTE(review): shift(k) assumes one row per day per country with no date
# gaps — confirm against the source data.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_firstCountryPairing[lag_col] = (
        df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(lag)
    )
In [409]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the leading rows of each country genuinely have no history;
# imputing 0 fabricates a "zero mortality" signal there — dropping those rows
# may be preferable.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
In [410]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# here includes the target 'Mortality Rate' and its three lag columns — the
# target leaks into the PCA features. The inputs are also unscaled, so
# large-magnitude columns dominate the components. Both points need review.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[410]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [411]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): matching the count of raw inputs is an arbitrary cut-off;
# pca.explained_variance_ratio_ would give a principled choice.
n_components = 7  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [412]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column is a principal component (a linear mix of ALL
# inputs), not the original variable — reusing raw feature names here is
# misleading; 'PC1'..'PC7' would be clearer (downstream selected_cols would
# need the same rename).
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [413]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df), so this effectively just removes 'location' from the frame.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [414]:
# Inputs: the seven principal components; target: the mortality rate.
# Rows of principal_df and df_firstCountryPairing align by position.
# NOTE(review): train_test_split shuffles a time series, so training rows can
# postdate test rows — a chronological split may be more appropriate here.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [415]:
# Fit scaling on the training set only — mean/std are learned from X_train so
# no test-set statistics leak into the transform.
scaler = StandardScaler()
scaler.fit(X_train)
Out[415]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [416]:
# Apply the train-fitted scaler to the training set
X_train_scaled = scaler.transform(X_train)
In [417]:
# Apply the train-fitted scaler to the test set (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [418]:
# Instantiate the RandomForestRegressor Model
# (the constructor's n_estimators=100 is superseded by the grid below;
# random_state=42 keeps every candidate fit reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [419]:
# perform grid search and 10-fold cross-validation (k = 10) over all 81
# hyper-parameter combinations; n_jobs=-1 runs candidate fits in parallel
# across CPU cores and does not change the results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.997363058447263
In [420]:
# fit random forest model with best hyperparameters from above;
# unpacking best_params_ keeps this cell in sync with the grid definition
# instead of hand-copying each key.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [421]:
# Evaluate the Random Forest on the held-out test set: MSE, RMSE, R^2, "entropy".
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalizes both vectors
# into probability distributions and returns their KL divergence — it is not a
# standard regression metric and is infinite when y_pred has a zero where
# y_test does not. Confirm this is the intended measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004452525636132693
R2 Score: 0.9978154993065472
RMSE: 0.066727
Entropy Value: 0.0005847514751188148
In [422]:
# Rank model inputs by impurity-based importance (highest first).
# NOTE(review): these "features" are principal components that reuse the raw
# column names, so the ranking describes PCs, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[422]:
feature importance
6 median_age 0.864046
1 diabetes_prevalence 0.091568
0 cardiovasc_death_rate 0.032643
5 aged_65_older 0.008213
2 female_smokers 0.002508
3 male_smokers 0.000722
4 life_expectancy 0.000301
In [423]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — a configurable DATA_DIR or
# relative path would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[423]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [424]:
# Country pair under analysis for the country health index model.
country1 = 'Slovakia'
country2 = 'Slovenia'

# Extracting important features for Random Forest Model Analysis for the country health index:
# keep identifiers, the health-index features, and the target, restricted to the two countries.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
In [425]:
# Inspect the filtered two-country frame before feature engineering.
df_updated
Out[425]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 113.128 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 113.128 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 113.128 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 113.128 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 113.128 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 0.536669

2091 rows × 8 columns

In [426]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day) within each country,
# so shifts never cross a country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, n_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(n_days)
In [427]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (these NaNs are the leading rows each shift() produced per country).
lagged_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lagged_cols] = df_updated[lagged_cols].fillna(0)
In [428]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lagged-mortality
# columns, so the components are partly built from the prediction target (target
# leakage) — this likely inflates the downstream R^2; confirm this is intended.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[428]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [429]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [430]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original variables;
# reusing the raw feature names here makes later importance tables easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [431]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns do not appear to be used downstream (X is built
# from principal_df); presumably kept for exploration — verify before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [432]:
# Model inputs are the PCA components (relabelled with the original feature names);
# the target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split; fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [433]:
# Fit scaling on the training set only, so the test data never influences the scaler.
scaler = StandardScaler().fit(X_train)
Out[433]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [434]:
# Apply scaling on the training set
# (uses the mean/std learned from the training split above)
X_train_scaled = scaler.transform(X_train)
In [435]:
# Apply scaling on the test set
# (transform only — the scaler was fit on training data, avoiding test leakage)
X_test_scaled = scaler.transform(X_test)
In [436]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by the grid search below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [437]:
# perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the regressor's R^2; refit=True retrains the best
# configuration on the full training split)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9972767647444323
In [438]:
# fit random forest model with best hyperparameters from above.
# Unpack best_params_ directly so the refit always matches the searched grid;
# re-listing each hyperparameter by hand risks silently dropping one.
# (grid_search.best_estimator_ is already refit and could be used directly;
# an explicit refit is kept here for clarity.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Generate predictions for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [439]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence between
# the two arrays after normalizing each to a probability distribution — it is not a
# standard regression error metric and is undefined for negative values. Confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE: same units/scale as the target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007179321254382132
R2 Score: 0.9964776772689534
RMSE: 0.084731
Entropy Value: 0.0007774232059793869
In [440]:
# Rank the model inputs by their Random Forest importance scores.
# Caveat: the inputs are principal components relabelled with original column names.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[440]:
feature importance
1 human_development_index 0.656420
0 hospital_beds_per_thousand 0.310241
2 extreme_poverty 0.024558
3 gdp_per_capita 0.008294
4 population_density 0.000487
In [441]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
Out[441]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [442]:
# Country pair under analysis for the population health index model.
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for Random Forest Model Analysis for the population health index:
# keep identifiers, the population-health features, and the target for the two countries.
population_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_firstCountryPairing = df_firstCountryPairing.loc[df_firstCountryPairing['location'].isin([country1, country2]), population_cols]
In [443]:
# Inspect the filtered two-country frame before feature engineering.
df_firstCountryPairing
Out[443]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 10 columns

In [444]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day) within each country,
# so shifts never cross a country boundary.
mortality_by_country = df_firstCountryPairing.groupby(['location'])['Mortality Rate']
for lag_col, n_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_firstCountryPairing[lag_col] = mortality_by_country.shift(n_days)
In [445]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (these NaNs are the leading rows each shift() produced per country).
lagged_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_firstCountryPairing[lagged_cols] = df_firstCountryPairing[lagged_cols].fillna(0)
In [446]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lagged-mortality
# columns, so the components are partly built from the prediction target (target
# leakage) — this likely inflates the downstream R^2; confirm this is intended.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns dominate.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
Out[446]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [447]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
In [448]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original variables;
# reusing the raw feature names here makes later importance tables easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
In [449]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns do not appear to be used downstream (X is built
# from principal_df); presumably kept for exploration — verify before removing.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
In [450]:
# Model inputs are the PCA components (relabelled with the original feature names);
# the target is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].to_numpy()
y = df_firstCountryPairing['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split; fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [451]:
# Fit scaling on the training set only, so the test data never influences the scaler.
scaler = StandardScaler().fit(X_train)
Out[451]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [452]:
# Apply scaling on the training set
# (uses the mean/std learned from the training split above)
X_train_scaled = scaler.transform(X_train)
In [453]:
# Apply scaling on the test set
# (transform only — the scaler was fit on training data, avoiding test leakage)
X_test_scaled = scaler.transform(X_test)
In [454]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by the grid search below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [455]:
# perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the regressor's R^2; refit=True retrains the best
# configuration on the full training split)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9639563497859113
In [456]:
# fit random forest model with best hyperparameters from above.
# Unpack best_params_ directly so the refit always matches the searched grid;
# re-listing each hyperparameter by hand risks silently dropping one.
# (grid_search.best_estimator_ is already refit and could be used directly;
# an explicit refit is kept here for clarity.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Generate predictions for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [457]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence between
# the two arrays after normalizing each to a probability distribution — it is not a
# standard regression error metric and is undefined for negative values. Confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE: same units/scale as the target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.483882915680185
R2 Score: 0.980047429301368
RMSE: 0.695617
Entropy Value: 0.005987538685788008
In [458]:
# Rank the model inputs by their Random Forest importance scores.
# Caveat: the inputs are principal components relabelled with original column names.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[458]:
feature importance
0 cardiovasc_death_rate 0.770054
1 diabetes_prevalence 0.089675
5 aged_65_older 0.045220
6 median_age 0.031215
2 female_smokers 0.026417
3 male_smokers 0.023605
4 life_expectancy 0.013812
In [459]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[459]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [460]:
# Country pair under analysis for the country health index model.
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for Random Forest Model Analysis for the country health index:
# keep identifiers, the health-index features, and the target for the two countries.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
In [461]:
# Inspect the filtered two-country frame before feature engineering.
df_updated
Out[461]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.2 39753.244 272.898 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.2 39753.244 272.898 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.2 39753.244 272.898 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.2 39753.244 272.898 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.2 39753.244 272.898 22.222222
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 1.084791

2136 rows × 8 columns

In [462]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day) within each country,
# so shifts never cross a country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, n_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(n_days)
In [463]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (these NaNs are the leading rows each shift() produced per country).
lagged_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lagged_cols] = df_updated[lagged_cols].fillna(0)
In [464]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lagged-mortality
# columns, so the components are partly built from the prediction target (target
# leakage) — this likely inflates the downstream R^2; confirm this is intended.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[464]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [465]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [466]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original variables;
# reusing the raw feature names here makes later importance tables easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [467]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns do not appear to be used downstream (X is built
# from principal_df); presumably kept for exploration — verify before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [468]:
# Model inputs are the PCA components (relabelled with the original feature names);
# the target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split; fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [469]:
# Fit scaling on the training set only, so the test data never influences the scaler.
scaler = StandardScaler().fit(X_train)
Out[469]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [470]:
# Apply scaling on the training set
# (uses the mean/std learned from the training split above)
X_train_scaled = scaler.transform(X_train)
In [471]:
# Apply scaling on the test set
# (transform only — the scaler was fit on training data, avoiding test leakage)
X_test_scaled = scaler.transform(X_test)
In [472]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by the grid search below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [473]:
# perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the regressor's R^2; refit=True retrains the best
# configuration on the full training split)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.9549424661572783
In [474]:
# fit random forest model with best hyperparameters from above.
# Unpack best_params_ directly so the refit always matches the searched grid;
# re-listing each hyperparameter by hand risks silently dropping one.
# (grid_search.best_estimator_ is already refit and could be used directly;
# an explicit refit is kept here for clarity.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Generate predictions for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [475]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence between
# the two arrays after normalizing each to a probability distribution — it is not a
# standard regression error metric and is undefined for negative values. Confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE: same units/scale as the target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.9691062567746922
R2 Score: 0.9600395871063083
RMSE: 0.984432
Entropy Value: 0.008653279765116838
In [476]:
# Rank the model inputs by their Random Forest importance scores.
# Caveat: the inputs are principal components relabelled with original column names.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[476]:
feature importance
1 human_development_index 0.868170
2 extreme_poverty 0.072435
0 hospital_beds_per_thousand 0.026159
4 population_density 0.021088
3 gdp_per_capita 0.012148
In [5]:
# Country Pair by Pair Analysis relative to cardiovascular death rate
In [6]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine;
# consider a configurable DATA_DIR / relative path.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[6]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [7]:
# Showing the pairings of countries based on cardiovascular death rate (13 pairs of countries)
# Each variable below holds one country's rows from the cleaned dataset;
# consecutive pairs of assignments correspond to one pairing.
df_Austria = df.loc[df["location"].eq("Austria")]
df_Belgium = df.loc[df["location"].eq("Belgium")]

df_Canada = df.loc[df["location"].eq("Canada")]
df_Cyprus = df.loc[df["location"].eq("Cyprus")]

df_Denmark = df.loc[df["location"].eq("Denmark")]
df_Finland = df.loc[df["location"].eq("Finland")]

df_France = df.loc[df["location"].eq("France")]
df_Iceland = df.loc[df["location"].eq("Iceland")]

df_Ireland = df.loc[df["location"].eq("Ireland")]
df_Italy = df.loc[df["location"].eq("Italy")]

df_Luxembourg = df.loc[df["location"].eq("Luxembourg")]
df_Netherlands = df.loc[df["location"].eq("Netherlands")]

df_Portugal = df.loc[df["location"].eq("Portugal")]
df_Spain = df.loc[df["location"].eq("Spain")]

df_Sweden = df.loc[df["location"].eq("Sweden")]
df_Switzerland = df.loc[df["location"].eq("Switzerland")]

df_UnitedKingdom = df.loc[df["location"].eq("United Kingdom")]
df_UnitedStates = df.loc[df["location"].eq("United States")]

df_Czechia = df.loc[df["location"].eq("Czechia")]
df_Estonia = df.loc[df["location"].eq("Estonia")]

df_Slovakia = df.loc[df["location"].eq("Slovakia")]
df_Slovenia = df.loc[df["location"].eq("Slovenia")]

df_Bulgaria = df.loc[df["location"].eq("Bulgaria")]
df_Latvia = df.loc[df["location"].eq("Latvia")]

df_Romania = df.loc[df["location"].eq("Romania")]
df_Serbia = df.loc[df["location"].eq("Serbia")]
In [8]:
# Drop the first two rows of the United Kingdom series (tail(-2) keeps all rows
# except the first 2). NOTE(review): presumably trims early outlier days / aligns
# series lengths across the pairings — confirm the intent.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [9]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# NOTE(review): to_csv writes the row index as an extra unnamed first column;
# pass index=False unless the later re-import step relies on that column.
dataframe_one.to_csv("dataframe-one.csv")
In [10]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): reads from a hardcoded Downloads path even though the file was just
# written to the working directory above — breaks on any other machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[10]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [8]:
country1 = 'Austria'
country2 = 'Belgium'

# Restrict to the population-health feature set plus identifiers and the target,
# then keep only the two countries under comparison.
population_health_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers',
                          'male_smokers', 'life_expectancy', 'aged_65_older',
                          'median_age', 'Mortality Rate']
df_updated = df_updated[population_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [9]:
df_updated
Out[9]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2095 Belgium 12/26/2022 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2096 Belgium 12/27/2022 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2097 Belgium 12/28/2022 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2098 Belgium 12/29/2022 4.29 25.1 31.4 81.63 18.571 41.8 0.711787

2099 rows × 9 columns

In [10]:
# Rationale: a Random Forest is an ensemble method for tabular, non-sequential
# data, so the OWID COVID-19 time series must be reframed as a supervised
# learning problem. Lagged copies of the target (previous day / week / month
# mortality) are added with pandas' shift() so each row carries its own recent
# history as ordinary feature columns, letting the model rank predictors of
# COVID-19 mortality per country.
lag_spec = [('prev_day_mortality', 1),
            ('prev_week_mortality', 7),
            ('prev_month_mortality', 30)]
for lag_col, periods in lag_spec:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
In [11]:
# Rows at the start of each lag window have no history; treat missing lags as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [12]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'date', which includes
# 'Mortality Rate' and its three lagged copies — the prediction target takes
# part in the PCA fit. Confirm whether only the predictor columns were meant
# to be decomposed (possible target leakage).
# NOTE(review): PCA is scale-sensitive and these inputs are unstandardized;
# scaling only happens after the train/test split below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[12]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [13]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# NOTE(review): these are the first 6 principal-component scores — linear
# mixtures of ALL columns from position 2 onward (including the mortality
# columns) — not the 6 original input variables themselves.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [14]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the column labels reuse the raw feature names, but each column
# is actually a principal component (PC1..PC6), not the named feature. Any
# downstream "feature importance" therefore ranks components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [15]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by
# the model below (X is built from principal_df); the practical effect here is
# only the removal of the original 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [16]:
# These names refer to columns of principal_df, which hold PCA component
# scores (mixtures of the original inputs), not the raw features themselves.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffled split on a daily time series places future
# rows in the training set; a chronological split may be more appropriate —
# confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [17]:
# Fit scaling on the training set
# Mean/std are learned from the training split only, so test-set statistics
# cannot leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[17]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [18]:
# Apply scaling on the training set
# Uses the mean/std fitted from X_train above.
X_train_scaled = scaler.transform(X_train)
In [19]:
# Apply scaling on the test set
# Reuses the training-set statistics (no refit on test data).
X_test_scaled = scaler.transform(X_test)
In [20]:
# Base estimator; its n_estimators value is a placeholder that the grid below
# overrides during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [21]:
# perform grid search and 10-fold cross-validation (k = 10)
# Scoring defaults to the estimator's score method, i.e. R^2 for regressors.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9982254364422947
In [22]:
# Refit a fresh forest with the best hyperparameter combination found above
# (best_params_ holds exactly the four grid keys, so unpacking is equivalent
# to passing each one individually).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [23]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1
# and returns the KL divergence between them — i.e. it treats raw mortality
# values as probability distributions, and yields inf if any y_pred entry is 0
# where y_test is positive. Confirm this is the intended "entropy" metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013887676064707104
R2 Score: 0.998827031715265
RMSE: 0.117846
Entropy Value: 0.0007324096000498459
In [24]:
# Rank the model's inputs by their impurity-based importance scores.
# (The labels name PCA component columns, as built earlier in the notebook.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[24]:
feature importance
0 diabetes_prevalence 0.946862
5 median_age 0.040970
2 male_smokers 0.004957
3 life_expectancy 0.004735
1 female_smokers 0.002206
4 aged_65_older 0.000270
In [25]:
# Reload the combined dataframe (first country of each pairing) from disk.
# NOTE(review): hardcoded absolute Windows path — consider a configurable data dir.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[25]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [26]:
country1 = 'Austria'
country2 = 'Belgium'

# Restrict to the country-health feature set plus identifiers and the target,
# then keep only the two countries under comparison.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated[country_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [27]:
df_updated
Out[27]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2095 Belgium 12/26/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2096 Belgium 12/27/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2097 Belgium 12/28/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2098 Belgium 12/29/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787

2099 rows × 9 columns

In [28]:
# Rationale: a Random Forest is an ensemble method for tabular, non-sequential
# data, so the OWID COVID-19 time series must be reframed as a supervised
# learning problem. Lagged copies of the target (previous day / week / month
# mortality) are added with pandas' shift() so each row carries its own recent
# history as ordinary feature columns.
lag_spec = [('prev_day_mortality', 1),
            ('prev_week_mortality', 7),
            ('prev_month_mortality', 30)]
for lag_col, periods in lag_spec:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
In [29]:
# Rows at the start of each lag window have no history; treat missing lags as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [30]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies, so the prediction target participates in the PCA fit — possible
# target leakage; confirm whether only predictors should be decomposed.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[30]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [31]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# NOTE(review): these are the first 6 PC scores (mixtures of all columns from
# position 2 onward, including the mortality columns), not the 6 named inputs.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [32]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): column labels reuse the raw feature names, but each column is a
# principal component, not that feature; downstream importances rank components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [33]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are not used by the model below
# (X is built from principal_df); this mainly removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [34]:
# These names refer to columns of principal_df, which hold PCA component
# scores, not the raw features themselves.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffled split on a daily time series places future rows
# in the training set; a chronological split may be more appropriate — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [35]:
# Fit scaling on the training set
# Mean/std are learned from the training split only (no test-set leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[35]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [36]:
# Apply scaling on the training set
# Uses the mean/std fitted from X_train above.
X_train_scaled = scaler.transform(X_train)
In [37]:
# Apply scaling on the test set
# Reuses the training-set statistics (no refit on test data).
X_test_scaled = scaler.transform(X_test)
In [38]:
# Base estimator; its n_estimators value is a placeholder that the grid below
# overrides during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [39]:
# perform grid search and 10-fold cross-validation (k = 10)
# Scoring defaults to the estimator's score method, i.e. R^2 for regressors.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9978932925070829
In [40]:
# Refit a fresh forest with the best hyperparameter combination found above
# (best_params_ holds exactly the four grid keys, so unpacking is equivalent).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [41]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and returns
# their KL divergence — treating raw mortality values as distributions.
# Confirm this is the intended "entropy" metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008856177569188209
R2 Score: 0.9992519975722188
RMSE: 0.094107
Entropy Value: 0.000433455898033473
In [42]:
# Rank the model's inputs by their impurity-based importance scores.
# (The labels name PCA component columns, as built earlier in the notebook.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[42]:
feature importance
1 human_development_index 0.927909
2 extreme_poverty 0.038889
5 population 0.030571
3 gdp_per_capita 0.002263
4 population_density 0.000344
0 hospital_beds_per_thousand 0.000024
In [43]:
# Reload the combined dataframe (first country of each pairing) from disk.
# NOTE(review): hardcoded absolute Windows path — consider a configurable data dir.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[43]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [44]:
country1 = 'Canada'
country2 = 'Cyprus'

# Restrict to the population-health feature set plus identifiers and the target,
# then keep only the two countries under comparison.
population_health_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers',
                          'male_smokers', 'life_expectancy', 'aged_65_older',
                          'median_age', 'Mortality Rate']
df_updated = df_updated[population_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [45]:
df_updated
Out[45]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2099 rows × 9 columns

In [46]:
# Rationale: a Random Forest is an ensemble method for tabular, non-sequential
# data, so the OWID COVID-19 time series must be reframed as a supervised
# learning problem. Lagged copies of the target (previous day / week / month
# mortality) are added with pandas' shift() so each row carries its own recent
# history as ordinary feature columns.
lag_spec = [('prev_day_mortality', 1),
            ('prev_week_mortality', 7),
            ('prev_month_mortality', 30)]
for lag_col, periods in lag_spec:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
In [47]:
# Rows at the start of each lag window have no history; treat missing lags as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [48]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies, so the prediction target participates in the PCA fit — possible
# target leakage; confirm whether only predictors should be decomposed.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[48]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [49]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# NOTE(review): these are the first 6 PC scores (mixtures of all columns from
# position 2 onward, including the mortality columns), not the 6 named inputs.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [50]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): column labels reuse the raw feature names, but each column is a
# principal component, not that feature; downstream importances rank components.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [51]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are not used by the model below
# (X is built from principal_df); this mainly removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [52]:
# These names refer to columns of principal_df, which hold PCA component
# scores, not the raw features themselves.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffled split on a daily time series places future rows
# in the training set; a chronological split may be more appropriate — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [53]:
# Fit scaling on the training set
# Mean/std are learned from the training split only (no test-set leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[53]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [54]:
# Apply scaling on the training set
# Uses the mean/std fitted from X_train above.
X_train_scaled = scaler.transform(X_train)
In [55]:
# Apply scaling on the test set
# Reuses the training-set statistics (no refit on test data).
X_test_scaled = scaler.transform(X_test)
In [56]:
# Base estimator; its n_estimators value is a placeholder that the grid below
# overrides during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [57]:
# perform grid search and 10-fold cross-validation (k = 10)
# Scoring defaults to the estimator's score method, i.e. R^2 for regressors.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983737881892208
In [58]:
# Refit a fresh forest with the best hyperparameter combination found above
# (best_params_ holds exactly the four grid keys, so unpacking is equivalent).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [59]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and returns
# their KL divergence — treating raw mortality values as distributions.
# Confirm this is the intended "entropy" metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0024167910786875983
R2 Score: 0.9992894760777331
RMSE: 0.049161
Entropy Value: 0.0002770358074043776
In [60]:
# Rank the model's inputs by their impurity-based importance scores.
# (The labels name PCA component columns, as built earlier in the notebook.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[60]:
feature importance
0 diabetes_prevalence 0.760003
1 female_smokers 0.112288
5 median_age 0.086465
2 male_smokers 0.037642
3 life_expectancy 0.003177
4 aged_65_older 0.000424
In [61]:
# Reload the combined dataframe (first country of each pairing) from disk.
# NOTE(review): hardcoded absolute Windows path — consider a configurable data dir.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[61]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [62]:
country1 = 'Canada'
country2 = 'Cyprus'

# Restrict to the country-health feature set plus identifiers and the target,
# then keep only the two countries under comparison.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated[country_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [63]:
df_updated
Out[63]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.093162

2099 rows × 9 columns

In [64]:
# Rationale: a Random Forest is an ensemble method for tabular, non-sequential
# data, so the OWID COVID-19 time series must be reframed as a supervised
# learning problem. Lagged copies of the target (previous day / week / month
# mortality) are added with pandas' shift() so each row carries its own recent
# history as ordinary feature columns.
lag_spec = [('prev_day_mortality', 1),
            ('prev_week_mortality', 7),
            ('prev_month_mortality', 30)]
for lag_col, periods in lag_spec:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
In [65]:
# Rows at the start of each lag window have no history; treat missing lags as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [66]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after date/location, which at
# this point includes 'Mortality Rate' and the three lagged mortality columns,
# so the target leaks into the components. PCA is also fit on the full dataset
# before the train/test split, and on unscaled features (the huge-magnitude
# 'population' column will dominate the variance). TODO: fit PCA on scaled
# training-set features only, with the target and its lags excluded.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[66]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [67]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Keep only the first 6 components (those explaining the most variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [68]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL the PCA input columns), not the original features — the feature names are
# kept only so downstream cells line up, but they are misleading; 'PC1'..'PC6'
# would be more honest labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Positional alignment: pca.transform preserved df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [69]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs below (X is
# built from principal_df); the practical effect here is removing 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [70]:
# Model inputs are the six retained principal components (labelled upstream
# with the original feature names); the target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [71]:
# Fit scaling on the training set
# Standardize features to zero mean / unit variance; fitting on the training
# split only keeps the test set from influencing the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[71]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [72]:
# Apply scaling on the training set using the statistics fitted above
X_train_scaled = scaler.transform(X_train)
In [73]:
# Apply scaling on the test set (same training-set statistics; no test leakage)
X_test_scaled = scaler.transform(X_test)
In [74]:
# Instantiate the RandomForestRegressor Model (seeded for reproducibility);
# n_estimators here is a placeholder that GridSearchCV overrides below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate hyperparameter combinations
param_grid = {
    'n_estimators': [50, 100, 200],    # number of trees in the forest
    'max_depth': [5, 10, 15],          # maximum depth of each tree
    'min_samples_split': [2, 5, 10],   # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples required at a leaf node
}
In [75]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 evaluates the 81 candidates x 10 folds across all CPU cores; it
# changes only wall-clock time, not the scores or the selected parameters.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (default scoring for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9989558784228938
In [76]:
# fit random forest model with best hyperparameters from above
# **best_params_ expands the winning grid entry directly — equivalent to the
# hand-copied keyword list, but it cannot drift out of sync if the grid gains
# or loses keys. (grid_search.best_estimator_ would also work: refit=True is
# the GridSearchCV default, so an already-refit model exists.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out, scaled test features
y_pred = best_rf_model.predict(X_test_scaled)
In [77]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors after normalizing each to sum to 1 — it treats y_test/y_pred
# as probability distributions, not paired regression values, and is undefined
# where y_pred is 0 but y_test is not. Interpret this number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0032162020605789846
R2 Score: 0.9990544534349546
RMSE: 0.056712
Entropy Value: 0.000341364428693161
In [78]:
# Rank model inputs by impurity-based importance, largest first.
# NOTE(review): X was built from PCA components, so each "importance" belongs
# to a principal component, not to the original feature whose name labels it.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[78]:
feature importance
5 population 0.515688
1 human_development_index 0.452145
2 extreme_poverty 0.028238
3 gdp_per_capita 0.003655
4 population_density 0.000192
0 hospital_beds_per_thousand 0.000083
In [79]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[79]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [80]:
country1 = 'Denmark'
country2 = 'Finland'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the filtered frame from the original, so the lag-column
# assignments in the next cell cannot trigger SettingWithCopyWarning or
# silently fail to write through a chained-indexing view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [81]:
# Inspect the filtered two-country frame (bare expression uses rich display)
df_updated
Out[81]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 6.41 19.3 18.8 80.90 19.677 42.3 0.00000
5188 Denmark 2/3/2020 6.41 19.3 18.8 80.90 19.677 42.3 0.00000
5189 Denmark 2/4/2020 6.41 19.3 18.8 80.90 19.677 42.3 0.00000
5190 Denmark 2/5/2020 6.41 19.3 18.8 80.90 19.677 42.3 0.00000
5191 Denmark 2/6/2020 6.41 19.3 18.8 80.90 19.677 42.3 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8372 Finland 12/26/2022 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8373 Finland 12/27/2022 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8374 Finland 12/28/2022 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8375 Finland 12/29/2022 5.76 18.3 22.6 81.91 21.228 42.8 0.55159

2128 rows × 9 columns

In [82]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) within each country,
# so shifts never bleed across country boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [83]:
# The first day/week/month of each country's series has no history, so the lag
# columns begin with NaN; treat "no prior data" as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [84]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after location/date, which at
# this point includes 'Mortality Rate' and the three lagged mortality columns,
# so the target leaks into the components. PCA is also fit on the full dataset
# before the train/test split, and on unscaled features. TODO: fit PCA on
# scaled training-set features only, with the target and its lags excluded.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[84]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [85]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Keep only the first 6 components (those explaining the most variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [86]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL the PCA input columns), not the original features — the feature names are
# kept only so downstream cells line up, but they are misleading; 'PC1'..'PC6'
# would be more honest labels.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Positional alignment: pca.transform preserved df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [87]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs below (X is
# built from principal_df); the practical effect here is removing 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [88]:
# Model inputs are the six retained principal components (labelled upstream
# with the original feature names); the target is the raw mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [89]:
# Fit scaling on the training set
# Standardize features to zero mean / unit variance; fitting on the training
# split only keeps the test set from influencing the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[89]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [90]:
# Apply scaling on the training set using the statistics fitted above
X_train_scaled = scaler.transform(X_train)
In [91]:
# Apply scaling on the test set (same training-set statistics; no test leakage)
X_test_scaled = scaler.transform(X_test)
In [92]:
# Instantiate the RandomForestRegressor Model (seeded for reproducibility);
# n_estimators here is a placeholder that GridSearchCV overrides below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate hyperparameter combinations
param_grid = {
    'n_estimators': [50, 100, 200],    # number of trees in the forest
    'max_depth': [5, 10, 15],          # maximum depth of each tree
    'min_samples_split': [2, 5, 10],   # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples required at a leaf node
}
In [93]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 evaluates the 81 candidates x 10 folds across all CPU cores; it
# changes only wall-clock time, not the scores or the selected parameters.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (default scoring for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985717047514587
In [94]:
# fit random forest model with best hyperparameters from above
# **best_params_ expands the winning grid entry directly — equivalent to the
# hand-copied keyword list, but it cannot drift out of sync if the grid gains
# or loses keys. (grid_search.best_estimator_ would also work: refit=True is
# the GridSearchCV default, so an already-refit model exists.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out, scaled test features
y_pred = best_rf_model.predict(X_test_scaled)
In [95]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors after normalizing each to sum to 1 — it treats y_test/y_pred
# as probability distributions, not paired regression values, and is undefined
# where y_pred is 0 but y_test is not. Interpret this number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007552765911951941
R2 Score: 0.9957154647674932
RMSE: 0.086907
Entropy Value: 0.0014811041103746996
In [96]:
# Rank model inputs by impurity-based importance, largest first.
# NOTE(review): X was built from PCA components, so each "importance" belongs
# to a principal component, not to the original feature whose name labels it.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[96]:
feature importance
0 diabetes_prevalence 0.949543
2 male_smokers 0.035863
1 female_smokers 0.008964
3 life_expectancy 0.004204
5 median_age 0.000997
4 aged_65_older 0.000429
In [97]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[97]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [98]:
country1 = 'Denmark'
country2 = 'Finland'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() detaches the filtered frame from the original, so the lag-column
# assignments in the next cell cannot trigger SettingWithCopyWarning or
# silently fail to write through a chained-indexing view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [99]:
# Inspect the filtered two-country frame (bare expression uses rich display)
df_updated
Out[99]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5188 Denmark 2/3/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5189 Denmark 2/4/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5190 Denmark 2/5/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5191 Denmark 2/6/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159

2128 rows × 9 columns

In [100]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) within each country,
# so shifts never bleed across country boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [101]:
# The first day/week/month of each country's series has no history, so the lag
# columns begin with NaN; treat "no prior data" as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [102]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after location/date, which at
# this point includes 'Mortality Rate' and the three lagged mortality columns,
# so the target leaks into the components. PCA is also fit on the full dataset
# before the train/test split, and on unscaled features. TODO: fit PCA on
# scaled training-set features only, with the target and its lags excluded.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[102]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [103]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Keep only the first 6 components (those explaining the most variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [104]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL the PCA input columns), not the original features — the feature names are
# kept only so downstream cells line up, but they are misleading; 'PC1'..'PC6'
# would be more honest labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Positional alignment: pca.transform preserved df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [105]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs below (X is
# built from principal_df); the practical effect here is removing 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [106]:
# Model inputs are the six retained principal components (labelled upstream
# with the original feature names); the target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [107]:
# Fit scaling on the training set
# Standardize features to zero mean / unit variance; fitting on the training
# split only keeps the test set from influencing the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[107]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [108]:
# Apply scaling on the training set using the statistics fitted above
X_train_scaled = scaler.transform(X_train)
In [109]:
# Apply scaling on the test set (same training-set statistics; no test leakage)
X_test_scaled = scaler.transform(X_test)
In [110]:
# Instantiate the RandomForestRegressor Model (seeded for reproducibility);
# n_estimators here is a placeholder that GridSearchCV overrides below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate hyperparameter combinations
param_grid = {
    'n_estimators': [50, 100, 200],    # number of trees in the forest
    'max_depth': [5, 10, 15],          # maximum depth of each tree
    'min_samples_split': [2, 5, 10],   # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples required at a leaf node
}
In [111]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 evaluates the 81 candidates x 10 folds across all CPU cores; it
# changes only wall-clock time, not the scores or the selected parameters.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (default scoring for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989156833724155
In [112]:
# fit random forest model with best hyperparameters from above
# **best_params_ expands the winning grid entry directly — equivalent to the
# hand-copied keyword list, but it cannot drift out of sync if the grid gains
# or loses keys. (grid_search.best_estimator_ would also work: refit=True is
# the GridSearchCV default, so an already-refit model exists.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out, scaled test features
y_pred = best_rf_model.predict(X_test_scaled)
In [113]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors after normalizing each to sum to 1 — it treats y_test/y_pred
# as probability distributions, not paired regression values, and is undefined
# where y_pred is 0 but y_test is not. Interpret this number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008329520038950185
R2 Score: 0.995274827461516
RMSE: 0.091266
Entropy Value: 0.0016508456649447665
In [114]:
# Rank model inputs by impurity-based importance, largest first.
# NOTE(review): X was built from PCA components, so each "importance" belongs
# to a principal component, not to the original feature whose name labels it.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[114]:
feature importance
1 human_development_index 0.956574
2 extreme_poverty 0.028531
5 population 0.008398
3 gdp_per_capita 0.006081
4 population_density 0.000371
0 hospital_beds_per_thousand 0.000044
In [115]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[115]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [116]:
country1 = 'France'
country2 = 'Iceland'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the filtered frame from the original, so the lag-column
# assignments in the next cell cannot trigger SettingWithCopyWarning or
# silently fail to write through a chained-indexing view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [117]:
# Inspect the filtered two-country frame (bare expression uses rich display)
df_updated
Out[117]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
8376 France 1/24/2020 4.77 30.1 35.6 82.66 19.718 42.0 0.00000
8377 France 1/25/2020 4.77 30.1 35.6 82.66 19.718 42.0 0.00000
8378 France 1/26/2020 4.77 30.1 35.6 82.66 19.718 42.0 0.00000
8379 France 1/27/2020 4.77 30.1 35.6 82.66 19.718 42.0 0.00000
8380 France 1/28/2020 4.77 30.1 35.6 82.66 19.718 42.0 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2107 rows × 9 columns

In [118]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) within each country,
# so shifts never bleed across country boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [119]:
# The first day/week/month of each country's series has no history, so the lag
# columns begin with NaN; treat "no prior data" as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [120]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after location/date, which at
# this point includes 'Mortality Rate' and the three lagged mortality columns,
# so the target leaks into the components. PCA is also fit on the full dataset
# before the train/test split, and on unscaled features. TODO: fit PCA on
# scaled training-set features only, with the target and its lags excluded.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[120]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [121]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Keep only the first 6 components (those explaining the most variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [122]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL the PCA input columns), not the original features — the feature names are
# kept only so downstream cells line up, but they are misleading; 'PC1'..'PC6'
# would be more honest labels.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Positional alignment: pca.transform preserved df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [123]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs below (X is
# built from principal_df); the practical effect here is removing 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [124]:
# Model inputs are the six retained principal components (labelled upstream
# with the original feature names); the target is the raw mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [125]:
# Fit scaling on the training set
# Standardize features to zero mean / unit variance; fitting on the training
# split only keeps the test set from influencing the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[125]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [126]:
# Apply scaling on the training set using the statistics fitted above
X_train_scaled = scaler.transform(X_train)
In [127]:
# Apply scaling on the test set (same training-set statistics; no test leakage)
X_test_scaled = scaler.transform(X_test)
In [128]:
# Instantiate the RandomForestRegressor Model (seeded for reproducibility);
# n_estimators here is a placeholder that GridSearchCV overrides below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate hyperparameter combinations
param_grid = {
    'n_estimators': [50, 100, 200],    # number of trees in the forest
    'max_depth': [5, 10, 15],          # maximum depth of each tree
    'min_samples_split': [2, 5, 10],   # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples required at a leaf node
}
In [129]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 evaluates the 81 candidates x 10 folds across all CPU cores; it
# changes only wall-clock time, not the scores or the selected parameters.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (default scoring for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9911158420617596
In [130]:
# fit random forest model with best hyperparameters from above
# **best_params_ expands the winning grid entry directly — equivalent to the
# hand-copied keyword list, but it cannot drift out of sync if the grid gains
# or loses keys. (grid_search.best_estimator_ would also work: refit=True is
# the GridSearchCV default, so an already-refit model exists.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out, scaled test features
y_pred = best_rf_model.predict(X_test_scaled)
In [131]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors after normalizing each to sum to 1 — it treats y_test/y_pred
# as probability distributions, not paired regression values, and is undefined
# where y_pred is 0 but y_test is not. Interpret this number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.12607663768274116
R2 Score: 0.9899744938655387
RMSE: 0.355073
Entropy Value: 0.0030851096784643584
In [132]:
# Rank the model inputs by the fitted forest's impurity-based importances.
# Use a distinct name for the raw array so `feature_importances` is not first
# an ndarray and then a DataFrame (name reuse hides bugs on re-run).
importance_values = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[132]:
feature importance
5 median_age 0.534321
1 female_smokers 0.332868
0 diabetes_prevalence 0.117862
2 male_smokers 0.010450
3 life_expectancy 0.003187
4 aged_65_older 0.001313
In [133]:
# Load the dataframe of "first countries" from each country pairing (previous step).
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[133]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [134]:
country1 = 'France'
country2 = 'Iceland'

# Restrict to the two countries under comparison and keep only the
# country-health-index features plus identifiers and the target.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [135]:
# Inspect the filtered two-country frame (rich last-expression display)
df_updated
Out[135]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2107 rows × 9 columns

In [136]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality-rate series within each country: 1 day, 7 days, 30 days.
# Grouping by location keeps one country's history out of another's lags.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [137]:
# The first 1/7/30 rows of each country have no history, so the shifted
# columns start as NaN; treat "no prior observation" as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col in lag_cols:
    df_updated[col] = df_updated[col].fillna(0)
In [138]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target (and its lags) leak into the PCA inputs;
# PCA is also fit on the full dataset before the train/test split. Confirm
# whether the near-perfect downstream R^2 is driven by this leakage.
# NOTE(review): the columns are on very different scales (e.g. population vs.
# index values) and PCA here is applied without standardizing first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[138]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [139]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): the PCA above was fit on 10 columns (6 features + target + 3
# lags), so these are the first 6 of 10 components — not a 1:1 mapping onto
# the six input variables. Confirm the intended design.
n_components = 6  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [140]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each principal component is a linear mixture of ALL input
# columns (including the mortality lags); labeling components with the original
# feature names is misleading — the downstream "feature importances" rank
# components, not these named variables. Confirm before interpreting.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [141]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced in
# the cells below (X is built from principal_df and y from 'Mortality Rate'),
# so this encoding appears to be dead work — confirm whether it can be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [142]:
# Assemble the model matrix from the PCA-derived columns and the target vector.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [143]:
# Fit the standardizer on the training split only, so the test split's
# statistics never influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X=X_train)
Out[143]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [144]:
# Apply the scaler (fitted on the training split) to the training features
X_train_scaled = scaler.transform(X=X_train)
In [145]:
# Apply the SAME training-fitted scaler to the test features, so no test-set
# statistics influence the scaling parameters
X_test_scaled = scaler.transform(X=X_test)
In [146]:
# Base RandomForestRegressor; random_state fixed for reproducibility.
# (n_estimators here is only a placeholder — the grid search below tunes it.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [147]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the CV fits across all cores; it does not change
# the results, only the wall-clock time of the search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameter combination and its mean CV score
# (scoring defaults to the estimator's score method, i.e. R^2 for a regressor)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9906321989356222
In [148]:
# Refit a random forest with the best hyperparameters from the grid search.
# Unpacking best_params_ avoids repeating each key by hand and stays correct
# if the tuned parameter set ever changes.
# (Equivalent shortcut: grid_search.best_estimator_ is already refit on the
# full training set when refit=True, the GridSearchCV default.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rate on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [149]:
# Evaluate the fitted Random Forest on the held-out test set:
# Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, and "entropy".
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and
# computes the KL divergence between them, not the entropy of the prediction
# errors — confirm this is the intended metric.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.1263773231369439
R2 Score: 0.9899505836160186
RMSE: 0.355496
Entropy Value: 0.0032779127594959943
In [150]:
# Rank the model inputs by the fitted forest's impurity-based importances.
# Use a distinct name for the raw array so `feature_importances` is not first
# an ndarray and then a DataFrame (name reuse hides bugs on re-run).
importance_values = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[150]:
feature importance
1 human_development_index 0.958053
2 extreme_poverty 0.017445
5 population 0.014344
3 gdp_per_capita 0.005157
0 hospital_beds_per_thousand 0.004252
4 population_density 0.000749
In [151]:
# Load the dataframe of "first countries" from each country pairing (previous step).
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[151]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [152]:
country1 = 'Ireland'
country2 = 'Italy'

# Restrict to the two countries under comparison and keep only the
# population-health-index features plus identifiers and the target.
health_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers',
               'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [153]:
# Inspect the filtered two-country frame (rich last-expression display)
df_updated
Out[153]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18839 Ireland 3/1/2020 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18840 Ireland 3/2/2020 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18841 Ireland 3/3/2020 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18842 Ireland 3/4/2020 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2099 rows × 9 columns

In [154]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality-rate series within each country: 1 day, 7 days, 30 days.
# Grouping by location keeps one country's history out of another's lags.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [155]:
# The first 1/7/30 rows of each country have no history, so the shifted
# columns start as NaN; treat "no prior observation" as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col in lag_cols:
    df_updated[col] = df_updated[col].fillna(0)
In [156]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target (and its lags) leak into the PCA inputs;
# PCA is also fit on the full dataset before the train/test split. Confirm
# whether the near-perfect downstream R^2 is driven by this leakage.
# NOTE(review): the columns are on very different scales and PCA here is
# applied without standardizing first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[156]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [157]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): the PCA above was fit on 10 columns (6 features + target + 3
# lags), so these are the first 6 of 10 components — not a 1:1 mapping onto
# the six input variables. Confirm the intended design.
n_components = 6  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [158]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each principal component is a linear mixture of ALL input
# columns (including the mortality lags); labeling components with the original
# feature names is misleading — the downstream "feature importances" rank
# components, not these named variables. Confirm before interpreting.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [159]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced in
# the cells below (X is built from principal_df and y from 'Mortality Rate'),
# so this encoding appears to be dead work — confirm whether it can be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [160]:
# Assemble the model matrix from the PCA-derived columns and the target vector.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [161]:
# Fit the standardizer on the training split only, so the test split's
# statistics never influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X=X_train)
Out[161]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [162]:
# Apply the scaler (fitted on the training split) to the training features
X_train_scaled = scaler.transform(X=X_train)
In [163]:
# Apply the SAME training-fitted scaler to the test features, so no test-set
# statistics influence the scaling parameters
X_test_scaled = scaler.transform(X=X_test)
In [164]:
# Base RandomForestRegressor; random_state fixed for reproducibility.
# (n_estimators here is only a placeholder — the grid search below tunes it.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [165]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the CV fits across all cores; it does not change
# the results, only the wall-clock time of the search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameter combination and its mean CV score
# (scoring defaults to the estimator's score method, i.e. R^2 for a regressor)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983164837805587
In [166]:
# Refit a random forest with the best hyperparameters from the grid search.
# Unpacking best_params_ avoids repeating each key by hand and stays correct
# if the tuned parameter set ever changes.
# (Equivalent shortcut: grid_search.best_estimator_ is already refit on the
# full training set when refit=True, the GridSearchCV default.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rate on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [167]:
# Evaluate the fitted Random Forest on the held-out test set:
# Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, and "entropy".
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and
# computes the KL divergence between them, not the entropy of the prediction
# errors — confirm this is the intended metric.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00685068236066425
R2 Score: 0.9994372009296628
RMSE: 0.082769
Entropy Value: 0.0003107010743677336
In [168]:
# Rank the model inputs by the fitted forest's impurity-based importances.
# Use a distinct name for the raw array so `feature_importances` is not first
# an ndarray and then a DataFrame (name reuse hides bugs on re-run).
importance_values = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[168]:
feature importance
5 median_age 0.517556
0 diabetes_prevalence 0.329949
1 female_smokers 0.142975
2 male_smokers 0.006820
3 life_expectancy 0.002311
4 aged_65_older 0.000389
In [169]:
# Load the dataframe of "first countries" from each country pairing (previous step).
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[169]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [170]:
country1 = 'Ireland'
country2 = 'Italy'

# Restrict to the two countries under comparison and keep only the
# country-health-index features plus identifiers and the target.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [171]:
# Inspect the filtered two-country frame (rich last-expression display)
df_updated
Out[171]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2099 rows × 9 columns

In [172]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality-rate series within each country: 1 day, 7 days, 30 days.
# Grouping by location keeps one country's history out of another's lags.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [173]:
# The first 1/7/30 rows of each country have no history, so the shifted
# columns start as NaN; treat "no prior observation" as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col in lag_cols:
    df_updated[col] = df_updated[col].fillna(0)
In [174]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target (and its lags) leak into the PCA inputs;
# PCA is also fit on the full dataset before the train/test split. Confirm
# whether the near-perfect downstream R^2 is driven by this leakage.
# NOTE(review): the columns are on very different scales (e.g. population vs.
# index values) and PCA here is applied without standardizing first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[174]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [175]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): the PCA above was fit on 10 columns (6 features + target + 3
# lags), so these are the first 6 of 10 components — not a 1:1 mapping onto
# the six input variables. Confirm the intended design.
n_components = 6  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [176]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each principal component is a linear mixture of ALL input
# columns (including the mortality lags); labeling components with the original
# feature names is misleading — the downstream "feature importances" rank
# components, not these named variables. Confirm before interpreting.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [177]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced in
# the cells below (X is built from principal_df and y from 'Mortality Rate'),
# so this encoding appears to be dead work — confirm whether it can be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [178]:
# Assemble the model matrix from the PCA-derived columns and the target vector.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [179]:
# Fit the standardizer on the training split only, so the test split's
# statistics never influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X=X_train)
Out[179]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [180]:
# Apply the scaler (fitted on the training split) to the training features
X_train_scaled = scaler.transform(X=X_train)
In [181]:
# Apply the SAME training-fitted scaler to the test features, so no test-set
# statistics influence the scaling parameters
X_test_scaled = scaler.transform(X=X_test)
In [182]:
# Base RandomForestRegressor; random_state fixed for reproducibility.
# (n_estimators here is only a placeholder — the grid search below tunes it.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [183]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the CV fits across all cores; it does not change
# the results, only the wall-clock time of the search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameter combination and its mean CV score
# (scoring defaults to the estimator's score method, i.e. R^2 for a regressor)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986413470709365
In [184]:
# Refit a random forest with the best hyperparameters from the grid search.
# Unpacking best_params_ avoids repeating each key by hand and stays correct
# if the tuned parameter set ever changes.
# (Equivalent shortcut: grid_search.best_estimator_ is already refit on the
# full training set when refit=True, the GridSearchCV default.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rate on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [185]:
# Evaluate the fitted Random Forest on the held-out test set:
# Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, and "entropy".
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and
# computes the KL divergence between them, not the entropy of the prediction
# errors — confirm this is the intended metric.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013803653998387413
R2 Score: 0.9988659985635657
RMSE: 0.117489
Entropy Value: 0.0004399732479586434
In [186]:
# Rank the model inputs by the fitted forest's impurity-based importances.
# Use a distinct name for the raw array so `feature_importances` is not first
# an ndarray and then a DataFrame (name reuse hides bugs on re-run).
importance_values = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[186]:
feature importance
1 human_development_index 0.937618
2 extreme_poverty 0.027005
5 population 0.023078
0 hospital_beds_per_thousand 0.008608
3 gdp_per_capita 0.002772
4 population_density 0.000918
In [187]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[187]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [188]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [189]:
# Inspect the filtered two-country frame (rich last-expression display)
df_updated
Out[189]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 4.42 20.9 26.0 82.25 14.312 39.7 0.377872

2078 rows × 9 columns

In [190]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality-rate series within each country: 1 day, 7 days, 30 days.
# Grouping by location keeps one country's history out of another's lags.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [191]:
# Replace the NaNs at the start of each country's series (rows that have no
# lagged observation yet) with 0 across all three lag columns at once.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [192]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:,2:] selects every column after 'location' and 'date', which at this
# point includes 'Mortality Rate' itself plus the three lagged-mortality columns — the
# prediction target leaks into the PCA inputs. Confirm this is intentional.
# NOTE(review): PCA is fitted on unscaled data, so the highest-variance raw columns
# (e.g. life_expectancy vs. diabetes_prevalence) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[192]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [193]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project the data and keep only the first 6 component columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [194]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these columns are principal components (linear mixes of ALL input columns),
# not the original features — labelling PC1..PC6 with original feature names makes the
# downstream feature-importance table misleading to interpret. Consider 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [195]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy columns are never used below — X is built from principal_df,
# so this encoding only alters df_updated's own columns. Confirm it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [196]:
# Features: the six retained principal components; target: the mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model.
# NOTE(review): train_test_split shuffles by default; on a daily time series this puts
# future days in the training set relative to test days (look-ahead). A chronological
# split may be more appropriate — confirm the shuffled split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [197]:
# Fit scaling on the training set only, so the test set never influences the
# mean/std used for standardization (avoids preprocessing leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[197]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [198]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [199]:
# Apply scaling on the test set (using the train-fitted scaler)
X_test_scaled = scaler.transform(X_test)
In [200]:
# Instantiate the RandomForestRegressor Model (base estimator for the grid search;
# n_estimators here is overridden by the grid below).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate configurations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [201]:
# perform grid search and 10-fold cross-validation (k = 10).
# Default scoring for a regressor is the estimator's R^2 score.
# NOTE(review): the near-perfect CV score below is plausibly driven by the lagged
# mortality columns (the target's own history) entering the PCA inputs — verify.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984742718532862
In [202]:
# Refit a fresh Random Forest configured with the grid search's winning
# hyperparameters, then predict on the held-out (scaled) test split.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [203]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into probability
# distributions and returns the KL divergence between them — not a standard regression
# error metric, and a zero prediction where y_test > 0 would yield inf. Confirm this is
# the intended "entropy" measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008485613991385917
R2 Score: 0.9988940462273057
RMSE: 0.092117
Entropy Value: 0.0013446805914148691
In [204]:
# Tabulate the fitted forest's impurity-based feature importances, most
# important first, and display the result.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[204]:
feature importance
5 median_age 0.729067
0 diabetes_prevalence 0.212295
2 male_smokers 0.034604
1 female_smokers 0.021167
3 life_expectancy 0.002298
4 aged_65_older 0.000569
In [205]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hardcoded absolute Windows path — a configurable data directory would
# let the notebook run on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[205]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [206]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Extracting important features for the Random Forest Model Analysis for the country health index.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materializes the filtered slice so the lag columns assigned in later cells
# write into an independent frame rather than a view of the original
# (avoids pandas' SettingWithCopyWarning / ambiguous chained assignment).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [207]:
df_updated
Out[207]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872

2078 rows × 9 columns

In [208]:
'''
Convert the OWID COVID-19 time series into a supervised-learning table by adding
lagged copies of the target: the mortality rate observed 1 day, 7 days, and 30
days earlier. shift() within each location keeps each country's history separate.
With its own recent history attached to every row as ordinary feature columns,
the data is in the tabular (non-sequential) form a Random Forest ensemble needs
to rank predictors of COVID-19 mortality per country.
'''
# Build all three lag features from one per-country grouping of the target column.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [209]:
# Replace the NaNs at the start of each country's series (rows that have no
# lagged observation yet) with 0 across all three lag columns at once.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [210]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:,2:] selects every column after 'location' and 'date', which at this
# point includes 'Mortality Rate' itself plus the three lagged-mortality columns — the
# prediction target leaks into the PCA inputs. Confirm this is intentional.
# NOTE(review): PCA is fitted on unscaled data, so the highest-variance raw columns
# (e.g. population, gdp_per_capita) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[210]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [211]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project the data and keep only the first 6 component columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [212]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these columns are principal components (linear mixes of ALL input columns),
# not the original features — labelling PC1..PC6 with original feature names makes the
# downstream feature-importance table misleading to interpret. Consider 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [213]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy columns are never used below — X is built from principal_df,
# so this encoding only alters df_updated's own columns. Confirm it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [214]:
# Features: the six retained principal components; target: the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model.
# NOTE(review): train_test_split shuffles by default; on a daily time series this puts
# future days in the training set relative to test days (look-ahead). A chronological
# split may be more appropriate — confirm the shuffled split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [215]:
# Fit scaling on the training set only, so the test set never influences the
# mean/std used for standardization (avoids preprocessing leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[215]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [216]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [217]:
# Apply scaling on the test set (using the train-fitted scaler)
X_test_scaled = scaler.transform(X_test)
In [218]:
# Instantiate the RandomForestRegressor Model (base estimator for the grid search;
# n_estimators here is overridden by the grid below).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate configurations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [219]:
# perform grid search and 10-fold cross-validation (k = 10).
# Default scoring for a regressor is the estimator's R^2 score.
# NOTE(review): the near-perfect CV score below is plausibly driven by the lagged
# mortality columns (the target's own history) entering the PCA inputs — verify.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985273943931006
In [220]:
# Refit a fresh Random Forest configured with the grid search's winning
# hyperparameters, then predict on the held-out (scaled) test split.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [221]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into probability
# distributions and returns the KL divergence between them — not a standard regression
# error metric, and a zero prediction where y_test > 0 would yield inf. Confirm this is
# the intended "entropy" measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007512088981123421
R2 Score: 0.9990209284610492
RMSE: 0.086672
Entropy Value: 0.0009194530556352238
In [222]:
# Tabulate the fitted forest's impurity-based feature importances, most
# important first, and display the result.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[222]:
feature importance
1 human_development_index 0.943109
2 extreme_poverty 0.037435
5 population 0.012103
3 gdp_per_capita 0.006234
0 hospital_beds_per_thousand 0.000694
4 population_density 0.000425
In [223]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hardcoded absolute Windows path — a configurable data directory would
# let the notebook run on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[223]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [224]:
country1 = 'Portugal'
country2 = 'Spain'

# Extracting important features for Random Forest Model Analysis for the population health index.
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() materializes the filtered slice so the lag columns assigned in later cells
# write into an independent frame rather than a view of the original
# (avoids pandas' SettingWithCopyWarning / ambiguous chained assignment).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [225]:
df_updated
Out[225]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 7.17 27.4 31.4 83.56 19.436 45.5 0.855148

2097 rows × 9 columns

In [226]:
'''
Convert the OWID COVID-19 time series into a supervised-learning table by adding
lagged copies of the target: the mortality rate observed 1 day, 7 days, and 30
days earlier. shift() within each location keeps each country's history separate.
With its own recent history attached to every row as ordinary feature columns,
the data is in the tabular (non-sequential) form a Random Forest ensemble needs
to rank predictors of COVID-19 mortality per country.
'''
# Build all three lag features from one per-country grouping of the target column.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [227]:
# Replace the NaNs at the start of each country's series (rows that have no
# lagged observation yet) with 0 across all three lag columns at once.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [228]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:,2:] selects every column after 'location' and 'date', which at this
# point includes 'Mortality Rate' itself plus the three lagged-mortality columns — the
# prediction target leaks into the PCA inputs. Confirm this is intentional.
# NOTE(review): PCA is fitted on unscaled data, so the highest-variance raw columns
# (e.g. life_expectancy vs. diabetes_prevalence) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[228]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [229]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project the data and keep only the first 6 component columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [230]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these columns are principal components (linear mixes of ALL input columns),
# not the original features — labelling PC1..PC6 with original feature names makes the
# downstream feature-importance table misleading to interpret. Consider 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [231]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy columns are never used below — X is built from principal_df,
# so this encoding only alters df_updated's own columns. Confirm it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [232]:
# Features: the six retained principal components; target: the mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model.
# NOTE(review): train_test_split shuffles by default; on a daily time series this puts
# future days in the training set relative to test days (look-ahead). A chronological
# split may be more appropriate — confirm the shuffled split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [233]:
# Fit scaling on the training set only, so the test set never influences the
# mean/std used for standardization (avoids preprocessing leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[233]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [234]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [235]:
# Apply scaling on the test set (using the train-fitted scaler)
X_test_scaled = scaler.transform(X_test)
In [236]:
# Instantiate the RandomForestRegressor Model (base estimator for the grid search;
# n_estimators here is overridden by the grid below).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate configurations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [237]:
# perform grid search and 10-fold cross-validation (k = 10).
# Default scoring for a regressor is the estimator's R^2 score.
# NOTE(review): the near-perfect CV score below is plausibly driven by the lagged
# mortality columns (the target's own history) entering the PCA inputs — verify.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974493361773998
In [238]:
# Refit a fresh Random Forest configured with the grid search's winning
# hyperparameters, then predict on the held-out (scaled) test split.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [239]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into probability
# distributions and returns the KL divergence between them — not a standard regression
# error metric, and a zero prediction where y_test > 0 would yield inf. Confirm this is
# the intended "entropy" measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.029155220822653562
R2 Score: 0.9946641908745901
RMSE: 0.170749
Entropy Value: 0.001250556072636702
In [240]:
# Tabulate the fitted forest's impurity-based feature importances, most
# important first, and display the result.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[240]:
feature importance
0 diabetes_prevalence 0.865528
1 female_smokers 0.064998
5 median_age 0.038122
2 male_smokers 0.021243
3 life_expectancy 0.009762
4 aged_65_older 0.000348
In [241]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hardcoded absolute Windows path — a configurable data directory would
# let the notebook run on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[241]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [242]:
country1 = 'Portugal'
country2 = 'Spain'

# Extracting important features for the Random Forest Model Analysis for the country health index.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materializes the filtered slice so the lag columns assigned in later cells
# write into an independent frame rather than a view of the original
# (avoids pandas' SettingWithCopyWarning / ambiguous chained assignment).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [243]:
df_updated
Out[243]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148

2097 rows × 9 columns

In [244]:
'''
Convert the OWID COVID-19 time series into a supervised-learning table by adding
lagged copies of the target: the mortality rate observed 1 day, 7 days, and 30
days earlier. shift() within each location keeps each country's history separate.
With its own recent history attached to every row as ordinary feature columns,
the data is in the tabular (non-sequential) form a Random Forest ensemble needs
to rank predictors of COVID-19 mortality per country.
'''
# Build all three lag features from one per-country grouping of the target column.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [245]:
# Replace the NaNs at the start of each country's series (rows that have no
# lagged observation yet) with 0 across all three lag columns at once.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [246]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:,2:] selects every column after 'location' and 'date', which at this
# point includes 'Mortality Rate' itself plus the three lagged-mortality columns — the
# prediction target leaks into the PCA inputs. Confirm this is intentional.
# NOTE(review): PCA is fitted on unscaled data, so the highest-variance raw columns
# (e.g. population, gdp_per_capita) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[246]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [247]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project the data and keep only the first 6 component columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [248]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these columns are principal components (linear mixes of ALL input columns),
# not the original features — labelling PC1..PC6 with original feature names makes the
# downstream feature-importance table misleading to interpret. Consider 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [249]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy columns are never used below — X is built from principal_df,
# so this encoding only alters df_updated's own columns. Confirm it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [250]:
# Features: the six retained principal components; target: the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model.
# NOTE(review): train_test_split shuffles by default; on a daily time series this puts
# future days in the training set relative to test days (look-ahead). A chronological
# split may be more appropriate — confirm the shuffled split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [251]:
# Fit scaling on the training set only, so the test set never influences the
# mean/std used for standardization (avoids preprocessing leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[251]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [252]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [253]:
# Apply scaling on the test set (using the train-fitted scaler)
X_test_scaled = scaler.transform(X_test)
In [254]:
# Instantiate the RandomForestRegressor Model (base estimator for the grid search;
# n_estimators here is overridden by the grid below).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate configurations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [255]:
# perform grid search and 10-fold cross-validation (k = 10).
# Default scoring for a regressor is the estimator's R^2 score.
# NOTE(review): the near-perfect CV score below is plausibly driven by the lagged
# mortality columns (the target's own history) entering the PCA inputs — verify.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9967905002141979
In [256]:
# Refit a random forest with the best hyperparameters found above. Unpacking
# best_params_ keeps this cell in sync with the search space automatically instead of
# repeating one dictionary lookup per hyperparameter.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test split
y_pred = best_rf_model.predict(X_test_scaled)
In [257]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs into probability
# distributions and returns their KL divergence; it is not a regression error metric
# and is ill-defined for negative values. Consider dropping it or justifying its use.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.02264676574464689
R2 Score: 0.9958553282770057
RMSE: 0.150488
Entropy Value: 0.0008599665585553152
In [258]:
# Rank the model inputs by how much the fitted forest relied on them
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[258]:
feature importance
1 human_development_index 0.929110
2 extreme_poverty 0.039519
5 population 0.027822
3 gdp_per_capita 0.001728
0 hospital_beds_per_thousand 0.001403
4 population_density 0.000418
In [259]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a DATA_DIR
# pathlib.Path constant defined once near the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[259]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [260]:
country1 = 'Sweden'
country2 = 'Switzerland'

# Restrict the frame to the population-health features (plus identifiers and the
# mortality-rate target) for the two countries being compared
population_health_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [261]:
df_updated
Out[261]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 4.79 18.8 18.9 82.80 19.985 41.0 0.816005

2102 rows × 9 columns

In [262]:
# A random forest expects tabular, non-sequential rows, so the time series is recast
# as a supervised-learning problem: each day's mortality rate is augmented with its
# own value 1 day, 7 days, and 30 days earlier via pandas shift(). The lags are
# computed per country so values never cross a country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [263]:
# The first day/week/month of each country has no earlier observation; treat those
# missing lag values as zero mortality
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [264]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date', which at
# this point includes 'Mortality Rate' and the three lagged-mortality columns — so the
# PCA is fitted on the target itself (target leakage). Confirm and exclude those
# columns from the PCA input.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[264]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [265]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# NOTE(review): these components derive from a matrix that includes 'Mortality Rate'
# and its lag columns, so the target leaks into the model inputs.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [266]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (PC1..PC6), not the original
# features — labelling them with the original feature names is misleading; consider
# renaming to pc1..pc6 here and in the downstream selected_cols list.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Carry the country label across (row order is preserved by PCA.transform)
principal_df['location'] = df_updated['location'].values
In [267]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns appear unused afterwards (only
# 'Mortality Rate' is read from df_updated below) — this step looks removable; confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [268]:
# Model inputs: the first six principal components (mislabelled upstream with the
# original feature names) and the mortality-rate target.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model.
# Fix: daily time-series rows with lagged mortality in the frame — a shuffled random
# split leaks future information into training; split chronologically instead
# (shuffle=False is deterministic, so random_state is unnecessary).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
In [269]:
# Learn standardisation statistics (mean/std) from the training split only
scaler = StandardScaler().fit(X_train)
Out[269]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [270]:
# Apply the train-fitted standardisation (mean/std) to the training set
X_train_scaled = scaler.transform(X_train)
In [271]:
# Apply the same train-fitted standardisation to the test set (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [272]:
# Base random-forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for GridSearchCV (3*3*3*3 = 81 candidates)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [273]:
# Perform grid search with 10-fold cross-validation (k = 10).
# Improvement: n_jobs=-1 runs the 81 candidate fits across all CPU cores in parallel;
# scores and selected parameters are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.996177252695678
In [274]:
# Refit a random forest with the best hyperparameters found above. Unpacking
# best_params_ keeps this cell in sync with the search space automatically instead of
# repeating one dictionary lookup per hyperparameter.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test split
y_pred = best_rf_model.predict(X_test_scaled)
In [275]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs into probability
# distributions and returns their KL divergence; it is not a regression error metric
# and is ill-defined for negative values. Consider dropping it or justifying its use.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.026225471602158636
R2 Score: 0.9949896669761966
RMSE: 0.161943
Entropy Value: 0.0011876312272444455
In [276]:
# Rank the model inputs by how much the fitted forest relied on them
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[276]:
feature importance
1 female_smokers 0.812203
0 diabetes_prevalence 0.156773
2 male_smokers 0.018060
5 median_age 0.007554
3 life_expectancy 0.004384
4 aged_65_older 0.001026
In [277]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a DATA_DIR
# pathlib.Path constant defined once near the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[277]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [278]:
country1 = 'Sweden'
country2 = 'Switzerland'

# Restrict the frame to the country-health-index features (plus identifiers and the
# mortality-rate target) for the two countries being compared
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [279]:
df_updated
Out[279]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.816005

2102 rows × 9 columns

In [280]:
# A random forest expects tabular, non-sequential rows, so the time series is recast
# as a supervised-learning problem: each day's mortality rate is augmented with its
# own value 1 day, 7 days, and 30 days earlier via pandas shift(). The lags are
# computed per country so values never cross a country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [281]:
# The first day/week/month of each country has no earlier observation; treat those
# missing lag values as zero mortality
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [282]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date', which at
# this point includes 'Mortality Rate' and the three lagged-mortality columns — so the
# PCA is fitted on the target itself (target leakage). Confirm and exclude those
# columns from the PCA input.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[282]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [283]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# NOTE(review): these components derive from a matrix that includes 'Mortality Rate'
# and its lag columns, so the target leaks into the model inputs.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [284]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (PC1..PC6), not the original
# features — labelling them with the original feature names is misleading; consider
# renaming to pc1..pc6 here and in the downstream selected_cols list.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label across (row order is preserved by PCA.transform)
principal_df['location'] = df_updated['location'].values
In [285]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns appear unused afterwards (only
# 'Mortality Rate' is read from df_updated below) — this step looks removable; confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [286]:
# Model inputs: the first six principal components (mislabelled upstream with the
# original feature names) and the mortality-rate target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model.
# Fix: daily time-series rows with lagged mortality in the frame — a shuffled random
# split leaks future information into training; split chronologically instead
# (shuffle=False is deterministic, so random_state is unnecessary).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
In [287]:
# Learn standardisation statistics (mean/std) from the training split only
scaler = StandardScaler().fit(X_train)
Out[287]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [288]:
# Apply the train-fitted standardisation (mean/std) to the training set
X_train_scaled = scaler.transform(X_train)
In [289]:
# Apply the same train-fitted standardisation to the test set (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [290]:
# Base random-forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for GridSearchCV (3*3*3*3 = 81 candidates)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [291]:
# Perform grid search with 10-fold cross-validation (k = 10).
# Improvement: n_jobs=-1 runs the 81 candidate fits across all CPU cores in parallel;
# scores and selected parameters are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9964944545383589
In [292]:
# Refit a random forest with the best hyperparameters found above. Unpacking
# best_params_ keeps this cell in sync with the search space automatically instead of
# repeating one dictionary lookup per hyperparameter.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test split
y_pred = best_rf_model.predict(X_test_scaled)
In [293]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs into probability
# distributions and returns their KL divergence; it is not a regression error metric
# and is ill-defined for negative values. Consider dropping it or justifying its use.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.025985308865571106
R2 Score: 0.9950355496702608
RMSE: 0.161200
Entropy Value: 0.0009864073077925806
In [294]:
# Rank the model inputs by how much the fitted forest relied on them
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[294]:
feature importance
1 human_development_index 0.938681
5 population 0.038091
2 extreme_poverty 0.018932
3 gdp_per_capita 0.003068
4 population_density 0.000901
0 hospital_beds_per_thousand 0.000327
In [295]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a DATA_DIR
# pathlib.Path constant defined once near the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[295]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [11]:
country1 = 'United Kingdom'
country2 = 'United States'

# Restrict the frame to the population-health features (plus identifiers and the
# mortality-rate target) for the two countries being compared
population_health_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [12]:
df_updated
Out[12]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 4.28 20.0 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 4.28 20.0 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 4.28 20.0 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 9 columns

In [13]:
# A random forest expects tabular, non-sequential rows, so the time series is recast
# as a supervised-learning problem: each day's mortality rate is augmented with its
# own value 1 day, 7 days, and 30 days earlier via pandas shift(). The lags are
# computed per country so values never cross a country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [14]:
# The first day/week/month of each country has no earlier observation; treat those
# missing lag values as zero mortality
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [15]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date', which at
# this point includes 'Mortality Rate' and the three lagged-mortality columns — so the
# PCA is fitted on the target itself (target leakage). Confirm and exclude those
# columns from the PCA input.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[15]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [16]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# NOTE(review): these components derive from a matrix that includes 'Mortality Rate'
# and its lag columns, so the target leaks into the model inputs.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [17]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (PC1..PC6), not the original
# features — labelling them with the original feature names is misleading; consider
# renaming to pc1..pc6 here and in the downstream selected_cols list.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Carry the country label across (row order is preserved by PCA.transform)
principal_df['location'] = df_updated['location'].values
In [18]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns appear unused afterwards (only
# 'Mortality Rate' is read from df_updated below) — this step looks removable; confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [19]:
# Model inputs: the first six principal components (mislabelled upstream with the
# original feature names) and the mortality-rate target.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model.
# Fix: daily time-series rows with lagged mortality in the frame — a shuffled random
# split leaks future information into training; split chronologically instead
# (shuffle=False is deterministic, so random_state is unnecessary).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
In [20]:
# Learn standardisation statistics (mean/std) from the training split only
scaler = StandardScaler().fit(X_train)
Out[20]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [21]:
# Apply the train-fitted standardisation (mean/std) to the training set
X_train_scaled = scaler.transform(X_train)
In [22]:
# Apply the same train-fitted standardisation to the test set (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [23]:
# Base random-forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for GridSearchCV (3*3*3*3 = 81 candidates)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [24]:
# Perform grid search with 10-fold cross-validation (k = 10).
# Improvement: n_jobs=-1 runs the 81 candidate fits across all CPU cores in parallel;
# scores and selected parameters are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9638881170030213
In [25]:
# Refit a random forest with the best hyperparameters found above. Unpacking
# best_params_ keeps this cell in sync with the search space automatically instead of
# repeating one dictionary lookup per hyperparameter.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test split
y_pred = best_rf_model.predict(X_test_scaled)
In [26]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs into probability
# distributions and returns their KL divergence; it is not a regression error metric
# and is ill-defined for negative values. Consider dropping it or justifying its use.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  1.307896296891642
R2 Score: 0.9460698187834822
RMSE: 1.143633
Entropy Value: 0.007993738527703239
In [27]:
# Rank the model inputs by how much the fitted forest relied on them
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[27]:
feature importance
0 diabetes_prevalence 0.848283
1 female_smokers 0.041961
5 median_age 0.040427
3 life_expectancy 0.032823
2 male_smokers 0.019508
4 aged_65_older 0.016998
In [28]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a DATA_DIR
# pathlib.Path constant defined once near the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[28]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [29]:
country1 = 'United Kingdom'
country2 = 'United States'

# Restrict the frame to the country-health-index features (plus identifiers and the
# mortality-rate target) for the two countries being compared
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [30]:
df_updated
Out[30]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2136 rows × 9 columns

In [31]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Build the per-location GroupBy once and reuse it for all three lags instead of
# recomputing the grouping three times.
mortality_by_location = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_location.shift(1)
df_updated['prev_week_mortality'] = mortality_by_location.shift(7)
df_updated['prev_month_mortality'] = mortality_by_location.shift(30)
In [32]:
# Replace NaN values in the lag columns with 0. The NaNs come from the shift():
# the first day/week/month of each location has no earlier observation to look back to.
# NOTE(review): 0 is also a legitimate mortality value early in the series, so this
# imputation conflates "no history yet" with "no deaths" — consider dropping the
# warm-up rows instead.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [33]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged-mortality
# columns, so the prediction target leaks into the components that are later used as
# model inputs — this very likely inflates the downstream CV and R^2 scores. Fit PCA
# on the predictor columns only.
# NOTE(review): PCA is scale-sensitive and is fit here on raw, unscaled data
# (population ~1e8 vs. HDI ~1), so the first component is dominated by the
# largest-magnitude column; standardize BEFORE PCA, not after.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[33]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [34]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Slicing the first 6 columns of the transform is equivalent to fitting
# PCA(n_components=6) directly on the same data.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [35]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear combinations of ALL
# the input columns (including the mortality lags) — not the original features.
# Relabeling them with the original feature names makes the later feature-importance
# table misleading: "importance of human_development_index" is really "importance of
# the 2nd principal component". Name them PC1..PC6 and adjust downstream cells.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [36]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns produced here are never used afterwards — X is
# built from principal_df and y from 'Mortality Rate' — so this cell appears to be
# dead and could be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [37]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# NOTE(review): these names select relabeled principal components (see the
# principal_df cell), not the raw features themselves.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split on a time series leaks future information into the
# training set; prefer a chronological split or sklearn's TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [38]:
# Fit scaling on the training set
# Fit on the training split only, so no test-set statistics leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[38]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [39]:
# Apply scaling on the training set (uses the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [40]:
# Apply scaling on the test set with the SAME training-set statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [41]:
# Instantiate the RandomForestRegressor Model
# Base estimator for the grid search; n_estimators=100 here is a placeholder that
# GridSearchCV overrides with values from the grid below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations, each scored with 10-fold CV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [42]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 x 10 candidate fits in parallel on all cores; the selected
# hyperparameters and scores are unchanged because each fit keeps random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9638248976625985
In [43]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ directly instead of copying each hyperparameter by hand; this
# stays correct if the grid ever gains or loses a parameter.
# (grid_search.best_estimator_ is an equivalent, already-refit model, since
# GridSearchCV refits on the full training set by default.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [44]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and computes the KL divergence D(pk || qk) — mortality
# rates are not a distribution, and any zero in y_pred where y_test is nonzero
# yields inf. This value is hard to interpret as a regression metric; consider
# dropping it or replacing it with MAE.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.908396956844949
R2 Score: 0.9625428922647679
RMSE: 0.953099
Entropy Value: 0.007020342279931816
In [45]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the 'feature' column names principal components that were relabeled
# with the original column names (see the principal_df cell). An importance listed
# for e.g. 'human_development_index' is really the importance of the i-th principal
# component — do not read this table as raw-feature importance.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[45]:
feature importance
1 human_development_index 0.840735
2 extreme_poverty 0.059561
5 population 0.058444
3 gdp_per_capita 0.032403
4 population_density 0.008650
0 hospital_beds_per_thousand 0.000206
In [331]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[331]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [332]:
country1 = 'Czechia'
country2 = 'Estonia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered slice an independent DataFrame: later cells assign new
# lag columns to it, which on a view raises SettingWithCopyWarning and may silently
# fail to write.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [333]:
df_updated
Out[333]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
4153 Czechia 3/1/2020 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4154 Czechia 3/2/2020 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4155 Czechia 3/3/2020 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4156 Czechia 3/4/2020 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4157 Czechia 3/5/2020 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 4.02 24.5 39.3 78.74 19.452 42.7 0.464100
7306 Estonia 12/26/2022 4.02 24.5 39.3 78.74 19.452 42.7 0.464100
7307 Estonia 12/27/2022 4.02 24.5 39.3 78.74 19.452 42.7 0.463645
7308 Estonia 12/28/2022 4.02 24.5 39.3 78.74 19.452 42.7 0.466423
7309 Estonia 12/29/2022 4.02 24.5 39.3 78.74 19.452 42.7 0.466423

2095 rows × 9 columns

In [334]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Build the per-location GroupBy once and reuse it for all three lags instead of
# recomputing the grouping three times.
mortality_by_location = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_location.shift(1)
df_updated['prev_week_mortality'] = mortality_by_location.shift(7)
df_updated['prev_month_mortality'] = mortality_by_location.shift(30)
In [335]:
# Replace NaN values in the lag columns with 0. The NaNs come from the shift():
# the first day/week/month of each location has no earlier observation to look back to.
# NOTE(review): 0 is also a legitimate mortality value early in the series, so this
# imputation conflates "no history yet" with "no deaths" — consider dropping the
# warm-up rows instead.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [336]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged-mortality
# columns, so the prediction target leaks into the components that are later used as
# model inputs — this very likely inflates the downstream CV and R^2 scores. Fit PCA
# on the predictor columns only, and standardize BEFORE PCA (it is scale-sensitive).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[336]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [337]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [338]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear combinations of ALL
# the input columns (including the mortality lags) — not the original features.
# Relabeling them with the original feature names makes the later feature-importance
# table misleading; name them PC1..PC6 and adjust downstream cells.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [339]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [340]:
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [341]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[341]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [342]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [343]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [344]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [345]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9976974336489285
In [346]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ directly instead of copying each hyperparameter by hand; this
# stays correct if the grid ever gains or loses a parameter.
# (grid_search.best_estimator_ is an equivalent, already-refit model, since
# GridSearchCV refits on the full training set by default.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [347]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0014714686404337467
R2 Score: 0.9978634041885316
RMSE: 0.038360
Entropy Value: 0.000335000984342334
In [348]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[348]:
feature importance
1 female_smokers 0.768576
0 diabetes_prevalence 0.169903
2 male_smokers 0.049100
5 median_age 0.009381
3 life_expectancy 0.002434
4 aged_65_older 0.000607
In [349]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[349]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [350]:
country1 = 'Czechia'
country2 = 'Estonia'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered slice an independent DataFrame: later cells assign new
# lag columns to it, which on a view raises SettingWithCopyWarning and may silently
# fail to write.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [351]:
df_updated
Out[351]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
4153 Czechia 3/1/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4154 Czechia 3/2/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4155 Czechia 3/3/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4156 Czechia 3/4/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4157 Czechia 3/5/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7306 Estonia 12/26/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7307 Estonia 12/27/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.463645
7308 Estonia 12/28/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423
7309 Estonia 12/29/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423

2095 rows × 9 columns

In [352]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Build the per-location GroupBy once and reuse it for all three lags instead of
# recomputing the grouping three times.
mortality_by_location = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_location.shift(1)
df_updated['prev_week_mortality'] = mortality_by_location.shift(7)
df_updated['prev_month_mortality'] = mortality_by_location.shift(30)
In [353]:
# Replace NaN values in the lag columns with 0. The NaNs come from the shift():
# the first day/week/month of each location has no earlier observation to look back to.
# NOTE(review): 0 is also a legitimate mortality value early in the series, so this
# imputation conflates "no history yet" with "no deaths" — consider dropping the
# warm-up rows instead.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [354]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged-mortality
# columns, so the prediction target leaks into the components that are later used as
# model inputs — this very likely inflates the downstream CV and R^2 scores. Fit PCA
# on the predictor columns only, and standardize BEFORE PCA (it is scale-sensitive).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[354]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [355]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [356]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear combinations of ALL
# the input columns (including the mortality lags) — not the original features.
# Relabeling them with the original feature names makes the later feature-importance
# table misleading; name them PC1..PC6 and adjust downstream cells.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [357]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [358]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [359]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[359]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [360]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [361]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [362]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [363]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9968447535278674
In [364]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ directly instead of copying each hyperparameter by hand; this
# stays correct if the grid ever gains or loses a parameter.
# (grid_search.best_estimator_ is an equivalent, already-refit model, since
# GridSearchCV refits on the full training set by default.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [365]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0019107375406635276
R2 Score: 0.9972255787761853
RMSE: 0.043712
Entropy Value: 0.0005664584928279815
In [366]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[366]:
feature importance
1 human_development_index 0.927005
2 extreme_poverty 0.053546
3 gdp_per_capita 0.010799
5 population 0.005969
0 hospital_beds_per_thousand 0.002133
4 population_density 0.000548
In [367]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[367]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [368]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered slice an independent DataFrame: later cells assign new
# lag columns to it, which on a view raises SettingWithCopyWarning and may silently
# fail to write.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [369]:
df_updated
Out[369]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2091 rows × 9 columns

In [370]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Build the per-location GroupBy once and reuse it for all three lags instead of
# recomputing the grouping three times.
mortality_by_location = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_location.shift(1)
df_updated['prev_week_mortality'] = mortality_by_location.shift(7)
df_updated['prev_month_mortality'] = mortality_by_location.shift(30)
In [371]:
# Replace NaN values in the lag columns with 0. The NaNs come from the shift():
# the first day/week/month of each location has no earlier observation to look back to.
# NOTE(review): 0 is also a legitimate mortality value early in the series, so this
# imputation conflates "no history yet" with "no deaths" — consider dropping the
# warm-up rows instead.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [372]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged-mortality
# columns, so the prediction target leaks into the components that are later used as
# model inputs — this very likely inflates the downstream CV and R^2 scores. Fit PCA
# on the predictor columns only, and standardize BEFORE PCA (it is scale-sensitive).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[372]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [373]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [374]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear combinations of ALL
# the input columns (including the mortality lags) — not the original features.
# Relabeling them with the original feature names makes the later feature-importance
# table misleading; name them PC1..PC6 and adjust downstream cells.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [375]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused below (only 'Mortality Rate'
# is read from df_updated afterwards), so this step may be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [376]:
# Inputs: the 6 retained principal components; target: the raw mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split shuffles time-series rows, so training can
# include days later than test days — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [377]:
# Fit scaling on the training set
# (fitting only on the training split avoids test-set leakage in the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[377]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [378]:
# Apply scaling on the training set
# (uses the mean/std learned from the training data above)
X_train_scaled = scaler.transform(X_train)
In [379]:
# Apply scaling on the test set
# (same training-set statistics; the test set is never used for fitting)
X_test_scaled = scaler.transform(X_test)
In [380]:
# Instantiate the RandomForestRegressor Model
# NOTE: n_estimators=100 here is only a placeholder — GridSearchCV below
# overrides it (and the other hyperparameters) from param_grid.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [381]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81-candidate x 10-fold fits across all CPU cores;
# it does not change which hyperparameters are selected.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9970316226679301
In [382]:
# Refit a Random Forest with the best hyperparameters found by the grid search,
# then generate predictions for the held-out test set.
# (best_params_ contains exactly n_estimators/max_depth/min_samples_split/min_samples_leaf)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [383]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and returns their KL divergence — applied to raw
# regression targets/predictions this is not a standard error metric, and a
# zero in y_pred where y_test is non-zero yields inf. Confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007095434014830886
R2 Score: 0.9965188340747636
RMSE: 0.084234
Entropy Value: 0.0009978441951363514
In [384]:
# Rank the model inputs by Random Forest importance, highest first.
# (Reminder: the "features" here are principal components wearing raw-feature names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[384]:
feature importance
1 female_smokers 0.556932
0 diabetes_prevalence 0.407805
2 male_smokers 0.023935
3 life_expectancy 0.006117
5 median_age 0.004886
4 aged_65_older 0.000324
In [385]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[385]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [386]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Restrict to the country-health-index features (plus identifiers and target)
# for the two countries under comparison.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [387]:
# Display the filtered dataframe to sanity-check the row/column selection.
df_updated
Out[387]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2091 rows × 9 columns

In [388]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (per-country lags of 1, 7, and 30 days on the mortality-rate series)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [389]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (NaNs come from the shift() above; 0 stands in for "no prior history")
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [390]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fitted on all rows (pre-split) and on unscaled features — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[390]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [391]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [392]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the original features they are named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [393]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream — possible dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [394]:
# Inputs: the 6 retained principal components; target: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split mixes past/future rows of a time series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [395]:
# Fit scaling on the training set
# (training split only, so the scaler sees no test data)
scaler = StandardScaler()
scaler.fit(X_train)
Out[395]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [396]:
# Apply scaling on the training set
# (using the training-set statistics fitted above)
X_train_scaled = scaler.transform(X_train)
In [397]:
# Apply scaling on the test set
# (same training-set statistics; the test set is never used for fitting)
X_test_scaled = scaler.transform(X_test)
In [398]:
# Instantiate the RandomForestRegressor Model
# (hyperparameters here are placeholders; GridSearchCV below overrides them)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [399]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81-candidate x 10-fold fits across all CPU cores;
# it does not change which hyperparameters are selected.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9967388889181926
In [400]:
# fit random forest model with best hyperparameters from above
# (equivalent shorthand: RandomForestRegressor(random_state=42, **grid_search.best_params_))
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [401]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence between
# the two vectors treated as distributions — not a standard regression metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010420033179699867
R2 Score: 0.9948877173166316
RMSE: 0.102079
Entropy Value: 0.0011192503090849918
In [402]:
# Rank inputs by RF importance (note: these are principal components, not raw features).
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[402]:
feature importance
1 human_development_index 0.891315
5 population 0.082122
2 extreme_poverty 0.022988
3 gdp_per_capita 0.002865
4 population_density 0.000641
0 hospital_beds_per_thousand 0.000069
In [403]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[403]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [404]:
country1 = 'Bulgaria'
country2 = 'Latvia'

# Extracting important features for Random Forest Model Analysis for the population health index
# (subset the columns first, then keep only the two countries being compared)
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [405]:
# Display the filtered dataframe to sanity-check the selection.
df_updated
Out[405]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2065 rows × 9 columns

In [406]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (per-country lags of 1, 7, and 30 days on the mortality-rate series)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [407]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (NaNs come from the shift() above; 0 stands in for "no prior history")
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [408]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fitted on all rows (pre-split) and on unscaled features — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[408]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [409]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [410]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the original features they are named after.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [411]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream — possible dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [412]:
# Inputs: the 6 retained principal components; target: the raw mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split mixes past/future rows of a time series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [413]:
# Fit scaling on the training set
# (training split only, so the scaler sees no test data)
scaler = StandardScaler()
scaler.fit(X_train)
Out[413]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [414]:
# Apply scaling on the training set
# (using the training-set statistics fitted above)
X_train_scaled = scaler.transform(X_train)
In [415]:
# Apply scaling on the test set
# (same training-set statistics; the test set is never used for fitting)
X_test_scaled = scaler.transform(X_test)
In [416]:
# Instantiate the RandomForestRegressor Model
# (hyperparameters here are placeholders; GridSearchCV below overrides them)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [417]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81-candidate x 10-fold fits across all CPU cores;
# it does not change which hyperparameters are selected.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9520281114433315
In [418]:
# fit random forest model with best hyperparameters from above
# (equivalent shorthand: RandomForestRegressor(random_state=42, **grid_search.best_params_))
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [419]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence between
# the two vectors treated as distributions — not a standard regression metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0034538780402943816
R2 Score: 0.9980017666453213
RMSE: 0.058770
Entropy Value: 0.0005870161018070032
In [420]:
# Rank inputs by RF importance (note: these are principal components, not raw features).
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[420]:
feature importance
0 diabetes_prevalence 0.784650
5 median_age 0.136987
2 male_smokers 0.036838
3 life_expectancy 0.022305
1 female_smokers 0.013887
4 aged_65_older 0.005333
In [421]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[421]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [422]:
country1 = 'Bulgaria'
country2 = 'Latvia'

# Extracting important features for the Random Forest Model Analysis for the country health index
# (subset the columns first, then keep only the two countries being compared)
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [423]:
# Display the filtered dataframe to sanity-check the selection.
df_updated
Out[423]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631969

2065 rows × 9 columns

In [424]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (per-country lags of 1, 7, and 30 days on the mortality-rate series)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [425]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (NaNs come from the shift() above; 0 stands in for "no prior history")
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [426]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fitted on all rows (pre-split) and on unscaled features — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[426]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [427]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [428]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the original features they are named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [429]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream — possible dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [430]:
# Inputs: the 6 retained principal components; target: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split mixes past/future rows of a time series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [431]:
# Fit scaling on the training set
# (training split only, so the scaler sees no test data)
scaler = StandardScaler()
scaler.fit(X_train)
Out[431]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [432]:
# Apply scaling on the training set
# (using the training-set statistics fitted above)
X_train_scaled = scaler.transform(X_train)
In [433]:
# Apply scaling on the test set
# (same training-set statistics; the test set is never used for fitting)
X_test_scaled = scaler.transform(X_test)
In [434]:
# Instantiate the RandomForestRegressor Model
# (hyperparameters here are placeholders; GridSearchCV below overrides them)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [435]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81-candidate x 10-fold fits across all CPU cores;
# it does not change which hyperparameters are selected.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9544889135900533
In [436]:
# fit random forest model with best hyperparameters from above
# (equivalent shorthand: RandomForestRegressor(random_state=42, **grid_search.best_params_))
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [437]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence between
# the two vectors treated as distributions — not a standard regression metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0058068592192322644
R2 Score: 0.9966404546882018
RMSE: 0.076203
Entropy Value: 0.000744361683555891
In [438]:
# Rank inputs by RF importance (note: these are principal components, not raw features).
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[438]:
feature importance
5 population 0.647422
0 hospital_beds_per_thousand 0.258173
2 extreme_poverty 0.035262
1 human_development_index 0.027633
3 gdp_per_capita 0.025970
4 population_density 0.005540
In [439]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[439]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [440]:
country1 = 'Romania'
country2 = 'Serbia'

# Keep the population-health-index features plus identifiers, restricted to
# the two countries in this pairing (single .loc does both selections).
health_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [441]:
df_updated
Out[441]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 9.74 22.9 37.1 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 9.74 22.9 37.1 76.05 17.850 43.0 2.036403

2076 rows × 9 columns

In [442]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() runs per country so one country's series never bleeds into the
# other's. NOTE(review): df_updated is a filtered slice of the imported
# frame, so these column assignments may raise SettingWithCopyWarning;
# adding .copy() after the isin() filter would silence it safely.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [443]:
# The first day/week/month of each country's series has no lagged value;
# treat those missing lags as zero mortality (one vectorized fill).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [444]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on df_updated.iloc[:, 2:], which includes
# 'Mortality Rate' and the three lagged mortality columns — i.e. the
# prediction target leaks into the components. It is also fit on
# unstandardized data, so large-magnitude columns dominate the components.
# Confirm both choices before trusting the downstream importances.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[444]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [445]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): pca was fit with all components; only the leading 6
# (highest explained variance) are kept by the column slice below.
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [446]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of
# ALL input columns), not the original features — labelling them with the
# original feature names makes the later "feature importance" table
# misleading. Names like PC1..PC6 would be more honest.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [447]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used as model
# inputs (X is taken from principal_df below); only 'Mortality Rate' is
# read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [448]:
# Assemble model inputs: X = first six principal components (named after
# the original features — see note on the principal_df cell), y = the
# unlagged mortality rate. Rows of principal_df align positionally with
# df_updated, so X and y stay row-aligned.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [449]:
# Fit scaling on the training set
# StandardScaler learns per-column mean/std from the training split only,
# so no test-set statistics leak into the model.
scaler = StandardScaler()
scaler.fit(X_train)
Out[449]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [450]:
# Apply scaling on the training set
# (uses the mean/std learned from X_train in the fit cell above)
X_train_scaled = scaler.transform(X_train)
In [451]:
# Apply scaling on the test set
# (same training-set statistics — the test set is never used for fitting)
X_test_scaled = scaler.transform(X_test)
In [452]:
# Baseline forest; GridSearchCV below clones it once per hyperparameter
# combination, so the n_estimators here is only a default.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: 3 * 3 * 3 * 3 = 81 candidate configurations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [453]:
# perform grid search and 10-fold cross-validation (k = 10)
# Uses the estimator's default scorer (R^2 for regressors). refit=True by
# default, so grid_search.best_estimator_ is already retrained on all of
# X_train_scaled after fitting.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9965278391641423
In [454]:
# Refit a forest with the tuned hyperparameters discovered by the grid
# search, then score the held-out (scaled) test split.
# best_params_ holds exactly {n_estimators, max_depth, min_samples_split,
# min_samples_leaf}, so unpacking it reproduces the original call.
best_rf_model = RandomForestRegressor(**grid_search.best_params_,
                                      random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [455]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between
# normalized distributions, not a regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002681968001128092
R2 Score: 0.9984385356636368
RMSE: 0.051788
Entropy Value: 0.0006055225602219516
In [456]:
# Rank the model inputs by impurity-based importance, highest first.
# (Labels come from selected_cols, which name PCA components.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
      .sort_values('importance', ascending=False)
)
feature_importances
Out[456]:
feature importance
0 diabetes_prevalence 0.756724
5 median_age 0.195600
1 female_smokers 0.036499
2 male_smokers 0.006774
3 life_expectancy 0.003531
4 aged_65_older 0.000872
In [457]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable DATA_DIR
# would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[457]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [458]:
country1 = 'Romania'
country2 = 'Serbia'

# Keep the country-health-index features plus identifiers, restricted to
# the two countries in this pairing (single .loc does both selections).
health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [459]:
df_updated
Out[459]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403

2076 rows × 9 columns

In [460]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() runs per country so series never bleed across countries.
# NOTE(review): assigning to a filtered slice may raise
# SettingWithCopyWarning; .copy() after filtering would silence it.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [461]:
# The first day/week/month of each country's series has no lagged value;
# treat those missing lags as zero mortality (one vectorized fill).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [462]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fit on df_updated.iloc[:, 2:] includes 'Mortality Rate'
# and its lag columns (target leakage), and the data is unstandardized so
# the large 'population' column will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[462]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [463]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): only the leading 6 components (highest variance) are kept.
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [464]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components (linear mixes of all
# inputs), not the original features — the borrowed names make the later
# importance table misleading; PC1..PC6 would be more honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [465]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummies are never used as model input; only
# 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [466]:
# Assemble model inputs: X = first six principal components (named after
# the original features — see note on the principal_df cell), y = the
# unlagged mortality rate; rows stay positionally aligned.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [467]:
# Fit scaling on the training set
# Per-column mean/std learned from the training split only (no leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[467]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [468]:
# Apply scaling on the training set
# (uses the statistics learned in the fit cell above)
X_train_scaled = scaler.transform(X_train)
In [469]:
# Apply scaling on the test set
# (same training-set statistics — the test set is never used for fitting)
X_test_scaled = scaler.transform(X_test)
In [470]:
# Baseline forest; GridSearchCV below clones it once per hyperparameter
# combination, so the n_estimators here is only a default.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: 3 * 3 * 3 * 3 = 81 candidate configurations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [471]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scorer (R^2 for regressors); refit=True by default, so
# best_estimator_ is already retrained on all of X_train_scaled.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9942151184753699
In [472]:
# Refit a forest with the tuned hyperparameters discovered by the grid
# search, then score the held-out (scaled) test split.
# best_params_ holds exactly the four tuned keys, so unpacking it
# reproduces the original explicit call.
best_rf_model = RandomForestRegressor(**grid_search.best_params_,
                                      random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [473]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between
# normalized distributions, not a regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003107698404524249
R2 Score: 0.9981906718406797
RMSE: 0.055747
Entropy Value: 0.0004506715934095217
In [474]:
# Rank the model inputs by impurity-based importance, highest first.
# (Labels come from selected_cols, which name PCA components.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
      .sort_values('importance', ascending=False)
)
feature_importances
Out[474]:
feature importance
5 population 0.753233
1 human_development_index 0.216832
2 extreme_poverty 0.021982
3 gdp_per_capita 0.007136
4 population_density 0.000814
0 hospital_beds_per_thousand 0.000003
In [475]:
# Country Pair by Pair Analysis relative to male smokers
In [476]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path — a configurable DATA_DIR
# would make the notebook portable.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[476]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [477]:
# Showing the pairings of countries based on male smokers (13 pairs of countries)
# The original cell repeated the same `df[df.location == "X"]` filter 26
# times. A single list + loop builds identical per-country frames (same
# rows, same order, same index) and binds them to the exact variable names
# the later cells use (e.g. "United Kingdom" -> df_UnitedKingdom).
pair_countries = [
    "Canada", "Denmark",
    "Finland", "Iceland",
    "Ireland", "Luxembourg",
    "Netherlands", "Slovenia",
    "Sweden", "United Kingdom",
    "United States", "Austria",
    "Belgium", "Czechia",
    "Estonia", "France",
    "Italy", "Portugal",
    "Romania", "Serbia",
    "Slovakia", "Spain",
    "Switzerland", "Bulgaria",
    "Cyprus", "Latvia",
]
for _country in pair_countries:
    # spaces dropped to form the variable name: df_UnitedStates, df_Canada, ...
    globals()["df_" + _country.replace(" ", "")] = df[df.location == _country]
In [478]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [479]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file.
# index=False: the original call wrote the integer index as an extra
# unnamed column, which reappears as "Unnamed: 0" on re-import.
# NOTE(review): this writes to the current working directory, while the
# import cells read from C:/Users/marco/Downloads/ — confirm the two paths
# actually point at the same file.
dataframe_one.to_csv("dataframe-one.csv", index=False)
In [480]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path; also note the export cell
# above wrote to the working directory, not to Downloads — verify they match.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[480]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [481]:
country1 = 'Canada'
country2 = 'Denmark'

# Keep the population-health-index features plus identifiers, restricted to
# the two countries in this pairing (single .loc does both selections).
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [482]:
df_updated
Out[482]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 80.90 19.677 42.3 0.000000
5188 Denmark 2/3/2020 114.767 6.41 19.3 80.90 19.677 42.3 0.000000
5189 Denmark 2/4/2020 114.767 6.41 19.3 80.90 19.677 42.3 0.000000
5190 Denmark 2/5/2020 114.767 6.41 19.3 80.90 19.677 42.3 0.000000
5191 Denmark 2/6/2020 114.767 6.41 19.3 80.90 19.677 42.3 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 82.43 16.984 41.4 1.093162

2134 rows × 9 columns

In [483]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() runs per country so series never bleed across countries.
# NOTE(review): assigning to a filtered slice may raise
# SettingWithCopyWarning; .copy() after filtering would silence it.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [484]:
# The first day/week/month of each country's series has no lagged value;
# treat those missing lags as zero mortality (one vectorized fill).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [485]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fit on df_updated.iloc[:, 2:] includes 'Mortality Rate'
# and its lag columns (target leakage), and the data is unstandardized so
# large-magnitude columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[485]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [486]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): only the leading 6 components (highest variance) are kept.
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [487]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components (linear mixes of all
# inputs), not the original features — the borrowed names make the later
# importance table misleading; PC1..PC6 would be more honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [488]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummies are never used as model input; only
# 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [489]:
# Assemble model inputs: X = first six principal components (named after
# the original features — see note on the principal_df cell), y = the
# unlagged mortality rate; rows stay positionally aligned.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [490]:
# Fit scaling on the training set
# Per-column mean/std learned from the training split only (no leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[490]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [491]:
# Apply scaling on the training set
# (uses the statistics learned in the fit cell above)
X_train_scaled = scaler.transform(X_train)
In [492]:
# Apply scaling on the test set
# (same training-set statistics — the test set is never used for fitting)
X_test_scaled = scaler.transform(X_test)
In [493]:
# Baseline forest; GridSearchCV below clones it once per hyperparameter
# combination, so the n_estimators here is only a default.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: 3 * 3 * 3 * 3 = 81 candidate configurations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [494]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scorer (R^2 for regressors); refit=True by default, so
# best_estimator_ is already retrained on all of X_train_scaled.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9985893709741042
In [495]:
# Refit a forest with the tuned hyperparameters discovered by the grid
# search, then score the held-out (scaled) test split.
# best_params_ holds exactly the four tuned keys, so unpacking it
# reproduces the original explicit call.
best_rf_model = RandomForestRegressor(**grid_search.best_params_,
                                      random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [496]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between
# normalized distributions, not a regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0035792870495897117
R2 Score: 0.999145763988884
RMSE: 0.059827
Entropy Value: 0.0004571636325590867
In [497]:
# Rank the model inputs by impurity-based importance, highest first.
# (Labels come from selected_cols, which name PCA components.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
      .sort_values('importance', ascending=False)
)
feature_importances
Out[497]:
feature importance
1 diabetes_prevalence 0.742289
0 cardiovasc_death_rate 0.224611
2 female_smokers 0.019414
5 median_age 0.011607
3 life_expectancy 0.001914
4 aged_65_older 0.000164
In [498]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable DATA_DIR
# would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[498]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [499]:
# Pair of countries analysed in this run.
country1 = 'Canada'
country2 = 'Denmark'

# Keep only the country-health-index features, then restrict the rows to the
# selected country pair in one .loc lookup (same result as selecting columns
# first and filtering rows afterwards).
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand',
                     'human_development_index', 'extreme_poverty',
                     'gdp_per_capita', 'population_density', 'population',
                     'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            health_index_cols]
In [500]:
df_updated
Out[500]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.5 0.940 0.2 46682.515 136.520 5882259 0.000000
5188 Denmark 2/3/2020 2.5 0.940 0.2 46682.515 136.520 5882259 0.000000
5189 Denmark 2/4/2020 2.5 0.940 0.2 46682.515 136.520 5882259 0.000000
5190 Denmark 2/5/2020 2.5 0.940 0.2 46682.515 136.520 5882259 0.000000
5191 Denmark 2/6/2020 2.5 0.940 0.2 46682.515 136.520 5882259 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 0.5 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 0.5 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 0.5 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 0.5 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 0.5 44017.591 4.037 38454328 1.093162

2134 rows × 9 columns

In [501]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country (1 day, 7 days and 30 days back)
# so the time series can be framed as a supervised-learning problem.
lag_by_column = {'prev_day_mortality': 1,
                 'prev_week_mortality': 7,
                 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for column_name, lag in lag_by_column.items():
    df_updated[column_name] = mortality_by_country.shift(lag)
In [502]:
# The first day/week/month of each country's series has no lagged value
# (shift produces NaN there); treat those gaps as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [503]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): at this point df_updated.iloc[:, 2:] contains not only the six
# predictor columns but also 'Mortality Rate' (the target) and the three lagged
# mortality columns added above. Fitting PCA on the target leaks it into the
# model inputs and likely explains the near-perfect R^2 downstream -- restrict
# the fit to predictor columns only. PCA is also fitted on unscaled data
# (scaling happens after the split), so large-magnitude columns such as
# population dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[503]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [504]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# NOTE(review): like the fit above, this transforms every column from index 2
# onward, which includes 'Mortality Rate' and its lags (target leakage).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [505]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components, i.e.
# linear combinations of all inputs -- naming them after the original features
# is misleading when interpreting feature importances later (PC1, PC2, ...
# would be more honest labels).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [506]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below --
# X is built from principal_df -- so this step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [507]:
# Feature matrix comes from the PCA scores; labels are reused original names.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split of a daily time series mixes future and
# past observations across train/test; combined with the leakage flagged at
# the PCA step, consider a chronological split (e.g. TimeSeriesSplit) instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [508]:
# Fit scaling on the training set
# Fitting the StandardScaler on the training portion only, so test-set
# statistics do not influence the scaling parameters (correct practice).
scaler = StandardScaler()
scaler.fit(X_train)
Out[508]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [509]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [510]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [511]:
# Instantiate the RandomForestRegressor Model
# Base estimator; its n_estimators is overridden by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 candidate combinations, each cross-validated 10-fold.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [512]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 evaluates the 81 parameter combinations x 10 folds in parallel on
# all CPU cores; scores and the selected model are unchanged, only wall-clock
# time improves.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989520477467734
In [513]:
# fit random forest model with best hyperparameters from above
# Unpack grid_search.best_params_ directly instead of copying each
# hyperparameter by hand -- identical model, and it stays correct if the
# parameter grid gains or loses keys.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test set, evaluated in the next cell.
y_pred = best_rf_model.predict(X_test_scaled)
In [514]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# two normalized distributions, not an "entropy" of the predictions; confirm
# this is the intended metric for a regression target.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0026540444143001835
R2 Score: 0.9993665832657773
RMSE: 0.051517
Entropy Value: 0.00039769797240609057
In [515]:
# Rank the model inputs by Random Forest impurity-based importance.
# NOTE(review): X holds PCA component scores, so each importance belongs to a
# principal component; the original column names reused from selected_cols do
# not correspond 1:1 to those components -- labels may mislead.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[515]:
feature importance
1 human_development_index 0.927082
5 population 0.037657
2 extreme_poverty 0.019357
0 hospital_beds_per_thousand 0.014506
3 gdp_per_capita 0.001192
4 population_density 0.000205
In [516]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[516]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [517]:
# Pair of countries analysed in this run.
country1 = 'Finland'
country2 = 'Iceland'

# Keep only the population-health-index features, then restrict the rows to
# the selected country pair in one .loc lookup (same result as selecting
# columns first and filtering rows afterwards).
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                   'diabetes_prevalence', 'female_smokers', 'life_expectancy',
                   'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            pop_health_cols]
In [518]:
df_updated
Out[518]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 81.91 21.228 42.8 0.00000
7311 Finland 1/30/2020 153.507 5.76 18.3 81.91 21.228 42.8 0.00000
7312 Finland 1/31/2020 153.507 5.76 18.3 81.91 21.228 42.8 0.00000
7313 Finland 2/1/2020 153.507 5.76 18.3 81.91 21.228 42.8 0.00000
7314 Finland 2/2/2020 153.507 5.76 18.3 81.91 21.228 42.8 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 82.99 14.431 37.3 0.11011

2102 rows × 9 columns

In [519]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country (1 day, 7 days and 30 days back)
# so the time series can be framed as a supervised-learning problem.
lag_by_column = {'prev_day_mortality': 1,
                 'prev_week_mortality': 7,
                 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for column_name, lag in lag_by_column.items():
    df_updated[column_name] = mortality_by_country.shift(lag)
In [520]:
# The first day/week/month of each country's series has no lagged value
# (shift produces NaN there); treat those gaps as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [521]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] here includes 'Mortality Rate' (the
# target) and the three lagged mortality columns -- the target leaks into the
# PCA features, likely inflating the R^2 below. PCA is also fitted on unscaled
# data, so large-magnitude columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[521]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [522]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [523]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [524]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [525]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [526]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[526]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [527]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [528]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [529]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [530]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 evaluates the 81 parameter combinations x 10 folds in parallel on
# all CPU cores; scores and the selected model are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9964956178544988
In [531]:
# fit random forest model with best hyperparameters from above
# Unpack grid_search.best_params_ directly instead of copying each
# hyperparameter by hand -- identical model, less error-prone if the grid
# changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test set, evaluated in the next cell.
y_pred = best_rf_model.predict(X_test_scaled)
In [532]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence, not an
# entropy of the predictions; confirm this is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0032498598374813537
R2 Score: 0.9972639789533597
RMSE: 0.057008
Entropy Value: 0.0008594876046729041
In [533]:
# Rank the model inputs by Random Forest impurity-based importance.
# NOTE(review): X holds PCA component scores; the original column names reused
# from selected_cols do not correspond 1:1 to those components.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[533]:
feature importance
1 diabetes_prevalence 0.520148
0 cardiovasc_death_rate 0.437998
5 median_age 0.025765
2 female_smokers 0.012087
3 life_expectancy 0.002962
4 aged_65_older 0.001041
In [534]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[534]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [535]:
# Pair of countries analysed in this run.
country1 = 'Finland'
country2 = 'Iceland'

# Keep only the country-health-index features, then restrict the rows to the
# selected country pair in one .loc lookup (same result as selecting columns
# first and filtering rows afterwards).
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand',
                     'human_development_index', 'extreme_poverty',
                     'gdp_per_capita', 'population_density', 'population',
                     'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            health_index_cols]
In [536]:
df_updated
Out[536]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
7310 Finland 1/29/2020 3.28 0.938 0.04 40585.721 18.136 5540745 0.00000
7311 Finland 1/30/2020 3.28 0.938 0.04 40585.721 18.136 5540745 0.00000
7312 Finland 1/31/2020 3.28 0.938 0.04 40585.721 18.136 5540745 0.00000
7313 Finland 2/1/2020 3.28 0.938 0.04 40585.721 18.136 5540745 0.00000
7314 Finland 2/2/2020 3.28 0.938 0.04 40585.721 18.136 5540745 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2102 rows × 9 columns

In [537]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country (1 day, 7 days and 30 days back)
# so the time series can be framed as a supervised-learning problem.
lag_by_column = {'prev_day_mortality': 1,
                 'prev_week_mortality': 7,
                 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for column_name, lag in lag_by_column.items():
    df_updated[column_name] = mortality_by_country.shift(lag)
In [538]:
# The first day/week/month of each country's series has no lagged value
# (shift produces NaN there); treat those gaps as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [539]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] here includes 'Mortality Rate' (the
# target) and the three lagged mortality columns -- the target leaks into the
# PCA features, likely inflating the R^2 below. PCA is also fitted on unscaled
# data, so large-magnitude columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[539]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [540]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [541]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [542]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [543]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [544]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[544]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [545]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [546]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [547]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [548]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 evaluates the 81 parameter combinations x 10 folds in parallel on
# all CPU cores; scores and the selected model are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9962131899966675
In [549]:
# fit random forest model with best hyperparameters from above
# Unpack grid_search.best_params_ directly instead of copying each
# hyperparameter by hand -- identical model, less error-prone if the grid
# changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test set, evaluated in the next cell.
y_pred = best_rf_model.predict(X_test_scaled)
In [550]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence, not an
# entropy of the predictions; confirm this is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002658355780630809
R2 Score: 0.9977619596754977
RMSE: 0.051559
Entropy Value: 0.0007340555943675478
In [551]:
# Rank the model inputs by Random Forest impurity-based importance.
# NOTE(review): X holds PCA component scores; the original column names reused
# from selected_cols do not correspond 1:1 to those components.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[551]:
feature importance
1 human_development_index 0.899612
5 population 0.064346
0 hospital_beds_per_thousand 0.017225
2 extreme_poverty 0.013530
3 gdp_per_capita 0.003408
4 population_density 0.001880
In [552]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[552]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [553]:
# Pair of countries analysed in this run.
country1 = 'Ireland'
country2 = 'Luxembourg'

# Keep only the population-health-index features, then restrict the rows to
# the selected country pair in one .loc lookup (same result as selecting
# columns first and filtering rows afterwards).
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                   'diabetes_prevalence', 'female_smokers', 'life_expectancy',
                   'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            pop_health_cols]
In [554]:
df_updated
Out[554]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 82.25 14.312 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 82.25 14.312 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 82.25 14.312 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 82.25 14.312 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 82.25 14.312 39.7 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 82.30 13.928 38.7 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 82.30 13.928 38.7 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 82.30 13.928 38.7 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 82.30 13.928 38.7 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 82.30 13.928 38.7 0.491388

2076 rows × 9 columns

In [555]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country (1 day, 7 days and 30 days back)
# so the time series can be framed as a supervised-learning problem.
lag_by_column = {'prev_day_mortality': 1,
                 'prev_week_mortality': 7,
                 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for column_name, lag in lag_by_column.items():
    df_updated[column_name] = mortality_by_country.shift(lag)
In [556]:
# Replace the NaNs introduced by the lag shifts (the first 1/7/30 rows of each
# country have no prior observation) with 0, in one vectorized call instead of
# three identical statements.
# NOTE(review): 0 is also a legitimate early-pandemic mortality value, so this
# imputation conflates "no history yet" with "no deaths" — confirm intended.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [557]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is scale-sensitive and is fit here on the raw (unscaled)
# columns, so large-magnitude features will dominate the components —
# standardizing before PCA is the usual practice; confirm this is intentional.
# NOTE(review): iloc[:, 2:] skips 'location'/'date' but still includes
# 'Mortality Rate' and the lag columns, so the target leaks into the
# components — verify this is the intended feature set.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[557]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [558]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Keep only the first 6 components (ordered by explained variance) of the full
# transform; equivalent to fitting PCA(n_components=6) directly.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [559]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (PC1..PC6), not the
# original features — labelling them with raw feature names is misleading and
# makes the later feature-importance table read as if it ranked raw variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Row order of principal_components matches df_updated, so positional
# assignment of the country label is valid here.
principal_df['location'] = df_updated['location'].values
In [560]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built from
# principal_df — so this step only replaces 'location' with dummy columns in
# df_updated; confirm it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [561]:
# Model inputs: the 6 principal components (labelled with raw feature names);
# target: the unscaled mortality rate (row order matches principal_df).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split of time-series rows mixes past and future
# observations between train and test, which can inflate scores — a
# chronological split would be the stricter evaluation; confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [562]:
# Learn standardization statistics (per-feature mean/std) from the training
# split only, so the test set never influences the scaling parameters.
scaler = StandardScaler().fit(X_train)
scaler
Out[562]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [563]:
# Apply scaling on the training set (uses the means/stds learned from X_train)
X_train_scaled = scaler.transform(X_train)
In [564]:
# Apply scaling on the test set with the training-set statistics (no refit,
# avoiding test-set leakage into the scaler)
X_test_scaled = scaler.transform(X_test)
In [565]:
# Base regressor for the grid search; n_estimators here is overridden by the
# grid below, random_state fixes the forest's randomness for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space (3 * 3 * 3 * 3 = 81 candidate combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [566]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 candidates x 10 folds across all CPU cores;
# the selected model is unchanged (fits are deterministic given random_state).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (the default scorer for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9981898039474573
In [567]:
# Refit a random forest with the best hyperparameters found above.
# Unpacking best_params_ replaces four hand-copied keyword arguments and stays
# correct automatically if the searched grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [568]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# two inputs after renormalizing each to sum to 1 — it is not a standard
# regression error metric, and zeros in y_test/y_pred make it ill-defined.
# Confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0024182536451926267
R2 Score: 0.9989409896781432
RMSE: 0.049176
Entropy Value: 0.0004535153217103071
In [569]:
# Rank the model's inputs by impurity-based importance, largest first.
# NOTE(review): these are importances of the PCA components, even though the
# labels carry raw feature names.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[569]:
feature importance
5 median_age 0.612473
0 cardiovasc_death_rate 0.345076
2 female_smokers 0.026558
1 diabetes_prevalence 0.013996
3 life_expectancy 0.001551
4 aged_65_older 0.000346
In [570]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — the notebook only runs on
# this machine; consider a configurable DATA_DIR / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[570]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [571]:
# Country pair under comparison for this run
country1 = 'Ireland'
country2 = 'Luxembourg'

# Keep the identifier columns, the country-health-index predictors and the
# target, and restrict the rows to the two countries being compared.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [572]:
df_updated
Out[572]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19869 Ireland 12/26/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19870 Ireland 12/27/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19871 Ireland 12/28/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19872 Ireland 12/29/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388

2076 rows × 9 columns

In [573]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged per-country mortality features (t-1 day, t-7 days, t-30 days) so the
# time series can be modelled as a supervised-learning problem.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, n_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(n_days)
In [574]:
# Replace the NaNs introduced by the lag shifts (the first 1/7/30 rows of each
# country have no prior observation) with 0, in one vectorized call instead of
# three identical statements.
# NOTE(review): 0 is also a legitimate early-pandemic mortality value, so this
# imputation conflates "no history yet" with "no deaths" — confirm intended.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [575]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[575]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [576]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [577]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [578]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [579]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [580]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[580]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [581]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [582]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [583]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [584]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 candidates x 10 folds across all CPU cores;
# the selected model is unchanged (fits are deterministic given random_state).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (the default scorer for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998351364576265
In [585]:
# Refit a random forest with the best hyperparameters found above.
# Unpacking best_params_ replaces four hand-copied keyword arguments and stays
# correct automatically if the searched grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [586]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# two inputs after renormalizing each to sum to 1 — it is not a standard
# regression error metric, and zeros in y_test/y_pred make it ill-defined.
# Confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001882911265242772
R2 Score: 0.9991754287359407
RMSE: 0.043393
Entropy Value: 0.0004605212242426924
In [587]:
# Rank the model's inputs by impurity-based importance, largest first.
# NOTE(review): these are importances of the PCA components, even though the
# labels carry raw feature names.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[587]:
feature importance
5 population 0.523426
1 human_development_index 0.437981
2 extreme_poverty 0.036239
3 gdp_per_capita 0.001743
4 population_density 0.000594
0 hospital_beds_per_thousand 0.000017
In [588]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[588]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [589]:
country1 = 'Netherlands'
country2 = 'Slovenia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [590]:
df_updated
Out[590]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 81.32 19.062 44.5 0.536669

2099 rows × 9 columns

In [591]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged per-country mortality features (t-1 day, t-7 days, t-30 days) so the
# time series can be modelled as a supervised-learning problem.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, n_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(n_days)
In [592]:
# Replace the NaNs introduced by the lag shifts (the first 1/7/30 rows of each
# country have no prior observation) with 0, in one vectorized call instead of
# three identical statements.
# NOTE(review): 0 is also a legitimate early-pandemic mortality value, so this
# imputation conflates "no history yet" with "no deaths" — confirm intended.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [593]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[593]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [594]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [595]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [596]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [597]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [598]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[598]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [599]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [600]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [601]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [602]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 candidates x 10 folds across all CPU cores;
# the selected model is unchanged (fits are deterministic given random_state).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (the default scorer for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9990369085942957
In [603]:
# Refit a random forest with the best hyperparameters found above.
# Unpacking best_params_ replaces four hand-copied keyword arguments and stays
# correct automatically if the searched grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [604]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# two inputs after renormalizing each to sum to 1 — it is not a standard
# regression error metric, and zeros in y_test/y_pred make it ill-defined.
# Confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004094213219663488
R2 Score: 0.9995002849901965
RMSE: 0.063986
Entropy Value: 0.00025517686621591407
In [605]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[605]:
feature importance
1 diabetes_prevalence 0.932248
0 cardiovasc_death_rate 0.037284
2 female_smokers 0.025795
5 median_age 0.003006
3 life_expectancy 0.001421
4 aged_65_older 0.000247
In [606]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[606]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [607]:
country1 = 'Netherlands'
country2 = 'Slovenia'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [608]:
df_updated
Out[608]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2099 rows × 9 columns

In [609]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged per-country mortality features (t-1 day, t-7 days, t-30 days) so the
# time series can be modelled as a supervised-learning problem.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, n_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(n_days)
In [610]:
# Replace the NaNs introduced by the lag shifts (the first 1/7/30 rows of each
# country have no prior observation) with 0, in one vectorized call instead of
# three identical statements.
# NOTE(review): 0 is also a legitimate early-pandemic mortality value, so this
# imputation conflates "no history yet" with "no deaths" — confirm intended.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [611]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[611]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [612]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [613]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [614]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [615]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [616]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[616]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [617]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [618]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [619]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [620]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 candidates x 10 folds across all CPU cores;
# the selected model is unchanged (fits are deterministic given random_state).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (the default scorer for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989307334211548
In [621]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [622]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs to sum to
# 1 and computes the KL divergence between two discrete distributions -- it
# is not a regression error metric, and it is infinite wherever y_pred is 0
# while y_test is not. Interpret with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00559759095128752
R2 Score: 0.9993167917577755
RMSE: 0.074817
Entropy Value: 0.00043829864165291835
In [623]:
# Rank the model inputs by their impurity-based importance in the fitted
# forest, largest first; the bare last expression renders the table.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[623]:
feature importance
1 human_development_index 0.967464
2 extreme_poverty 0.027111
5 population 0.002733
3 gdp_per_capita 0.001380
0 hospital_beds_per_thousand 0.000901
4 population_density 0.000412
In [624]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- not portable; consider a
# configurable DATA_DIR. Re-reading the CSV here deliberately resets
# df_updated for the next country-pair analysis.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[624]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [625]:
# Country pair under comparison in this run
country1 = 'Sweden'
country2 = 'United Kingdom'

# Keep only the population-health-index features (plus identifiers and the
# target), and restrict the rows to the two countries of interest.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [626]:
# Display the filtered two-country frame for visual inspection
df_updated
Out[626]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 82.80 19.985 41.0 0.816005

2126 rows × 9 columns

In [627]:
# A Random Forest has no notion of temporal order, so the recent mortality
# history is exposed to the model explicitly as lagged feature columns
# (pandas shift()); this converts the timeseries into a plain
# supervised-learning table.
#
# Work on an explicit copy first: df_updated is a row-filtered slice of the
# loaded frame, and adding columns to a slice can raise pandas'
# SettingWithCopyWarning (and may not write through to the parent frame).
df_updated = df_updated.copy()

# Lag the mortality rate by 1 day, 7 days, and 30 days within each country,
# so a row never sees the other country's history.
for lag, col in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [628]:
# The first 1/7/30 observations of each country have no history to lag from;
# treat those missing lag values as 0 rather than dropping the rows.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [629]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target
# AND its lagged copies (added in the cell above), so the components fed to
# the model are partly built FROM the target -- this leaks the answer into
# the features and inflates the downstream R^2. PCA is also fit on all rows
# before the train/test split (a second leak) and on unscaled columns, so
# large-magnitude variables dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[629]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [630]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Keep only the first 6 components (those with the largest explained variance);
# each component is a linear mixture of ALL the columns PCA was fit on.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [631]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading -- each column is a
# principal component (a mixture of all PCA input variables, including the
# lagged mortality columns), not the original feature it is named after.
# The feature-importance table later in the notebook inherits these labels,
# so its per-variable attribution should not be taken at face value.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [632]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are added to df_updated, but
# df_updated only supplies the target below -- the dummies never reach the
# model's feature matrix (X is taken from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [633]:
# Column labels of the six retained principal components (each is a mixture
# of all PCA inputs, not the original variable it is named after).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on daily timeseries rows places
# near-duplicate adjacent days in both train and test, inflating test R^2.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [634]:
# Fit scaling on the training set
# (fitting on the training rows only keeps test-set statistics out of the
# transform; the bare last expression displays the fitted estimator)
scaler = StandardScaler()
scaler.fit(X_train)
Out[634]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [635]:
# Apply scaling on the training set
# (uses the mean/std learned from the training rows in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [636]:
# Apply scaling on the test set
# (reuses the training-set mean/std; the scaler must not be refit on test data)
X_test_scaled = scaler.transform(X_test)
In [637]:
# Base estimator for the hyperparameter search; the fixed seed keeps tree
# construction reproducible across runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space for GridSearchCV: forest size, tree depth, and the two
# minimum-sample constraints that control split/leaf granularity.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [638]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 candidates x 10 folds across all CPU cores;
# it does not change the scores. Default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9647620002703616
In [639]:
# GridSearchCV was constructed with the default refit=True, so it has already
# refit a RandomForestRegressor with the best hyperparameters on the whole
# training set -- reuse that estimator instead of rebuilding it by hand.
# (The base estimator carries random_state=42, so the model is identical.)
best_rf_model = grid_search.best_estimator_

# Predict mortality on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [640]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs to sum to
# 1 and computes the KL divergence between two discrete distributions -- it
# is not a regression error metric, and it is infinite wherever y_pred is 0
# while y_test is not. Interpret with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.19934385811888447
R2 Score: 0.9926835029165212
RMSE: 0.446479
Entropy Value: 0.002995808129382941
In [641]:
# Rank the model inputs by their impurity-based importance in the fitted
# forest, largest first; the bare last expression renders the table.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[641]:
feature importance
0 cardiovasc_death_rate 0.821076
5 median_age 0.086721
1 diabetes_prevalence 0.033801
3 life_expectancy 0.032590
2 female_smokers 0.018086
4 aged_65_older 0.007726
In [642]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[642]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [643]:
country1 = 'Sweden'
country2 = 'United Kingdom'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [644]:
df_updated
Out[644]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.816005

2126 rows × 9 columns

In [645]:
# A Random Forest has no notion of temporal order, so the recent mortality
# history is exposed to the model explicitly as lagged feature columns
# (pandas shift()); this converts the timeseries into a plain
# supervised-learning table.
#
# Work on an explicit copy first: df_updated is a row-filtered slice of the
# loaded frame, and adding columns to a slice can raise pandas'
# SettingWithCopyWarning (and may not write through to the parent frame).
df_updated = df_updated.copy()

# Lag the mortality rate by 1 day, 7 days, and 30 days within each country,
# so a row never sees the other country's history.
for lag, col in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [646]:
# The first 1/7/30 observations of each country have no history to lag from;
# treat those missing lag values as 0 rather than dropping the rows.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [647]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target
# AND its lagged copies (added in the cell above), so the components fed to
# the model are partly built FROM the target -- this leaks the answer into
# the features and inflates the downstream R^2. PCA is also fit on all rows
# before the train/test split (a second leak) and on unscaled columns, so
# large-magnitude variables such as population dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[647]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [648]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [649]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [650]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [651]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [652]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[652]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [653]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [654]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [655]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [656]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 candidates x 10 folds across all CPU cores;
# it does not change the scores. Default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9657040062307898
In [657]:
# GridSearchCV was constructed with the default refit=True, so it has already
# refit a RandomForestRegressor with the best hyperparameters on the whole
# training set -- reuse that estimator instead of rebuilding it by hand.
# (The base estimator carries random_state=42, so the model is identical.)
best_rf_model = grid_search.best_estimator_

# Predict mortality on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [658]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs to sum to
# 1 and computes the KL divergence between two discrete distributions -- it
# is not a regression error metric, and it is infinite wherever y_pred is 0
# while y_test is not. Interpret with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.1755698880186287
R2 Score: 0.9935560764913614
RMSE: 0.419011
Entropy Value: 0.0030443221086945424
In [659]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[659]:
feature importance
1 human_development_index 0.795197
5 population 0.117897
2 extreme_poverty 0.038008
3 gdp_per_capita 0.030219
4 population_density 0.018509
0 hospital_beds_per_thousand 0.000171
In [660]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[660]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [661]:
country1 = 'United States'
country2 = 'Austria'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [662]:
df_updated
Out[662]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 78.86 15.413 38.3 1.084791

2112 rows × 9 columns

In [663]:
# A Random Forest has no notion of temporal order, so the recent mortality
# history is exposed to the model explicitly as lagged feature columns
# (pandas shift()); this converts the timeseries into a plain
# supervised-learning table.
#
# Work on an explicit copy first: df_updated is a row-filtered slice of the
# loaded frame, and adding columns to a slice can raise pandas'
# SettingWithCopyWarning (and may not write through to the parent frame).
df_updated = df_updated.copy()

# Lag the mortality rate by 1 day, 7 days, and 30 days within each country,
# so a row never sees the other country's history.
for lag, col in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [664]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [665]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target
# AND its lagged copies (added in the cell above), so the components fed to
# the model are partly built FROM the target -- this leaks the answer into
# the features and inflates the downstream R^2. PCA is also fit on all rows
# before the train/test split (a second leak) and on unscaled columns, so
# large-magnitude variables dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[665]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [666]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [667]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [668]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [669]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [670]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[670]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [671]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [672]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [673]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [674]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 candidates x 10 folds across all CPU cores;
# it does not change the scores. Default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9707123801174072
In [675]:
# GridSearchCV was constructed with the default refit=True, so it has already
# refit a RandomForestRegressor with the best hyperparameters on the whole
# training set -- reuse that estimator instead of rebuilding it by hand.
# (The base estimator carries random_state=42, so the model is identical.)
best_rf_model = grid_search.best_estimator_

# Predict mortality on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [676]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs to sum to
# 1 and computes the KL divergence between two discrete distributions -- it
# is not a regression error metric, and it is infinite wherever y_pred is 0
# while y_test is not. Interpret with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.015612908849576288
R2 Score: 0.9900116946003525
RMSE: 0.124952
Entropy Value: 0.0008956642056999705
In [677]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[677]:
feature importance
1 diabetes_prevalence 0.884760
0 cardiovasc_death_rate 0.061283
2 female_smokers 0.022979
4 aged_65_older 0.015255
3 life_expectancy 0.008161
5 median_age 0.007562
In [678]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[678]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [679]:
country1 = 'United States'
country2 = 'Austria'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [680]:
df_updated
Out[680]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2112 rows × 9 columns

In [681]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day, 7 days, 30 days back), shifted within each
# country so one country's history never bleeds into another country's rows.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [682]:
# Rows at the start of each country's series have no lag history;
# treat those missing lag values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [683]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged-target
# columns created above, so the target leaks into the PCA inputs — restrict the
# fit to the predictor columns only and refit.
# NOTE(review): PCA is fit on unscaled data, so large-magnitude columns
# (e.g. 'population', ~1e8) will dominate the components; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[683]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [684]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# NOTE(review): these are the first 6 principal components — linear combinations
# of ALL columns passed to pca.fit (including the lagged target) — not the 6
# original input variables themselves.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [685]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns below are principal components but are labelled with
# the original feature names; downstream feature importances will therefore be
# attributed to the wrong variables — name them PC1..PC6 instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the components (relies on row order matching df_updated).
principal_df['location'] = df_updated['location'].values
In [686]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not used by the model
# below (X is built from principal_df), so this step appears removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [687]:
# NOTE(review): these names refer to PCA components, not the original variables (see labelling above).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on a daily time series with lagged-target
# features lets near-duplicate neighbouring days span train and test, inflating
# CV/test scores — consider a chronological split (e.g. TimeSeriesSplit).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [688]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics cannot leak into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
Out[688]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [689]:
# Apply scaling on the training set
# (uses the means/variances learned in the fit cell above)
X_train_scaled = scaler.transform(X_train)
In [690]:
# Apply scaling on the test set
# (transform only — the scaler must never be re-fit on test data)
X_test_scaled = scaler.transform(X_test)
In [691]:
# Base Random Forest regressor; its hyperparameters are overridden by the
# grid search in the next cell.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space: 3*3*3*3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [692]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 candidate fits across all CPU cores; the
# selected model and scores are unchanged, only wall-clock time improves.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9748898922979944
In [693]:
# fit random forest model with best hyperparameters from above
# GridSearchCV (refit=True by default) has already refit a RandomForestRegressor
# with the best hyperparameters on the whole training set, so reuse that
# estimator instead of rebuilding it parameter-by-parameter; this avoids
# copy/paste errors and keeps random_state consistent with the search.
best_rf_model = grid_search.best_estimator_

# Predictions on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [694]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1
# and returns the KL divergence between them as if they were probability
# distributions — it is not a standard regression metric, and is inf whenever
# y_pred is 0 where y_test > 0. Confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011356761380877888
R2 Score: 0.9927345504853691
RMSE: 0.106568
Entropy Value: 0.0008026741253104213
In [695]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to the PCs, not to the original variables whose names
# label the rows — interpret with caution.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Display sorted importances (last expression renders the frame).
feature_importances
Out[695]:
feature importance
1 human_development_index 0.941349
2 extreme_poverty 0.024552
5 population 0.015274
4 population_density 0.010674
3 gdp_per_capita 0.007128
0 hospital_beds_per_thousand 0.001023
In [696]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[696]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [697]:
country1 = 'Belgium'
country2 = 'Czechia'

# Restrict the frame to the two countries being compared, keeping the
# population-health predictors plus the target for this model.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
In [698]:
df_updated
Out[698]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 6.82 30.5 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 6.82 30.5 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 6.82 30.5 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 6.82 30.5 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 6.82 30.5 79.38 19.027 43.3 0.919575

2094 rows × 9 columns

In [699]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day, 7 days, 30 days back), shifted within each
# country so one country's history never bleeds into another country's rows.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [700]:
# Rows at the start of each country's series have no lag history;
# treat those missing lag values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [701]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged-target
# columns created above, so the target leaks into the PCA inputs — restrict the
# fit to the predictor columns only and refit.
# NOTE(review): PCA is fit on unscaled data, so large-magnitude columns
# (e.g. 'cardiovasc_death_rate', hundreds) will dominate; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[701]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [702]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# NOTE(review): these are the first 6 principal components — linear combinations
# of ALL columns passed to pca.fit (including the lagged target) — not the 6
# original input variables themselves.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [703]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns below are principal components but are labelled with
# the original feature names; downstream feature importances will therefore be
# attributed to the wrong variables — name them PC1..PC6 instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Carry the country label alongside the components (relies on row order matching df_updated).
principal_df['location'] = df_updated['location'].values
In [704]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not used by the model
# below (X is built from principal_df), so this step appears removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [705]:
# NOTE(review): these names refer to PCA components, not the original variables (see labelling above).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on a daily time series with lagged-target
# features lets near-duplicate neighbouring days span train and test, inflating
# CV/test scores — consider a chronological split (e.g. TimeSeriesSplit).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [706]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics cannot leak into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
Out[706]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [707]:
# Apply scaling on the training set
# (uses the means/variances learned in the fit cell above)
X_train_scaled = scaler.transform(X_train)
In [708]:
# Apply scaling on the test set
# (transform only — the scaler must never be re-fit on test data)
X_test_scaled = scaler.transform(X_test)
In [709]:
# Base Random Forest regressor; its hyperparameters are overridden by the
# grid search in the next cell.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space: 3*3*3*3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [710]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 candidate fits across all CPU cores; the
# selected model and scores are unchanged, only wall-clock time improves.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9981564421259727
In [711]:
# fit random forest model with best hyperparameters from above
# GridSearchCV (refit=True by default) has already refit a RandomForestRegressor
# with the best hyperparameters on the whole training set, so reuse that
# estimator instead of rebuilding it parameter-by-parameter; this avoids
# copy/paste errors and keeps random_state consistent with the search.
best_rf_model = grid_search.best_estimator_

# Predictions on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [712]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1
# and returns the KL divergence between them as if they were probability
# distributions — it is not a standard regression metric, and is inf whenever
# y_pred is 0 where y_test > 0. Confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.021770315476854027
R2 Score: 0.9980743421157474
RMSE: 0.147548
Entropy Value: 0.0008394924242857518
In [713]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to the PCs, not to the original variables whose names
# label the rows — interpret with caution.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Display sorted importances (last expression renders the frame).
feature_importances
Out[713]:
feature importance
1 diabetes_prevalence 0.774151
0 cardiovasc_death_rate 0.164452
2 female_smokers 0.036317
5 median_age 0.022970
3 life_expectancy 0.001647
4 aged_65_older 0.000464
In [714]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[714]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [715]:
country1 = 'Belgium'
country2 = 'Czechia'

# Restrict the frame to the two countries being compared, keeping the
# socioeconomic predictors plus the target for the country-health-index model.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [716]:
df_updated
Out[716]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.63 0.900 0.0 32605.906 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.63 0.900 0.0 32605.906 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.63 0.900 0.0 32605.906 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.63 0.900 0.0 32605.906 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.63 0.900 0.0 32605.906 137.176 10493990 0.919575

2094 rows × 9 columns

In [717]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day, 7 days, 30 days back), shifted within each
# country so one country's history never bleeds into another country's rows.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [718]:
# Rows at the start of each country's series have no lag history;
# treat those missing lag values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [719]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged-target
# columns created above, so the target leaks into the PCA inputs — restrict the
# fit to the predictor columns only and refit.
# NOTE(review): PCA is fit on unscaled data, so large-magnitude columns
# (e.g. 'population', ~1e7) will dominate the components; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[719]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [720]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# NOTE(review): these are the first 6 principal components — linear combinations
# of ALL columns passed to pca.fit (including the lagged target) — not the 6
# original input variables themselves.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [721]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns below are principal components but are labelled with
# the original feature names; downstream feature importances will therefore be
# attributed to the wrong variables — name them PC1..PC6 instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the components (relies on row order matching df_updated).
principal_df['location'] = df_updated['location'].values
In [722]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not used by the model
# below (X is built from principal_df), so this step appears removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [723]:
# NOTE(review): these names refer to PCA components, not the original variables (see labelling above).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on a daily time series with lagged-target
# features lets near-duplicate neighbouring days span train and test, inflating
# CV/test scores — consider a chronological split (e.g. TimeSeriesSplit).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [724]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics cannot leak into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
Out[724]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [725]:
# Apply scaling on the training set
# (uses the means/variances learned in the fit cell above)
X_train_scaled = scaler.transform(X_train)
In [726]:
# Apply scaling on the test set
# (transform only — the scaler must never be re-fit on test data)
X_test_scaled = scaler.transform(X_test)
In [727]:
# Base Random Forest regressor; its hyperparameters are overridden by the
# grid search in the next cell.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space: 3*3*3*3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [728]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 candidate fits across all CPU cores; the
# selected model and scores are unchanged, only wall-clock time improves.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9981977178578196
In [729]:
# fit random forest model with best hyperparameters from above
# GridSearchCV (refit=True by default) has already refit a RandomForestRegressor
# with the best hyperparameters on the whole training set, so reuse that
# estimator instead of rebuilding it parameter-by-parameter; this avoids
# copy/paste errors and keeps random_state consistent with the search.
best_rf_model = grid_search.best_estimator_

# Predictions on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [730]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1
# and returns the KL divergence between them as if they were probability
# distributions — it is not a standard regression metric, and is inf whenever
# y_pred is 0 where y_test > 0. Confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.018443397054300942
R2 Score: 0.9983686192794139
RMSE: 0.135806
Entropy Value: 0.000594716015312401
In [731]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to the PCs, not to the original variables whose names
# label the rows — interpret with caution.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Display sorted importances (last expression renders the frame).
feature_importances
Out[731]:
feature importance
1 human_development_index 0.939053
2 extreme_poverty 0.038798
5 population 0.014107
0 hospital_beds_per_thousand 0.004909
3 gdp_per_capita 0.002483
4 population_density 0.000649
In [732]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[732]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [733]:
country1 = 'Estonia'
country2 = 'France'

# Restrict the frame to the two countries being compared, keeping the
# population-health predictors plus the target for this model.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
In [734]:
df_updated
Out[734]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 78.74 19.452 42.7 0.000000
6250 Estonia 1/18/2020 255.569 4.02 24.5 78.74 19.452 42.7 0.000000
6251 Estonia 2/5/2020 255.569 4.02 24.5 78.74 19.452 42.7 0.000000
6252 Estonia 2/6/2020 255.569 4.02 24.5 78.74 19.452 42.7 0.000000
6253 Estonia 2/7/2020 255.569 4.02 24.5 78.74 19.452 42.7 0.000000
... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 82.66 19.718 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 82.66 19.718 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 82.66 19.718 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 82.66 19.718 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 82.66 19.718 42.0 0.411892

2132 rows × 9 columns

In [735]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day, 7 days, 30 days back), shifted within each
# country so one country's history never bleeds into another country's rows.
# NOTE(review): the early Estonia rows are not daily-contiguous (1/6, 1/18, 2/5…),
# so a 1-row shift is not always "previous day" there — confirm acceptable.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [736]:
# Rows at the start of each country's series have no lag history;
# treat those missing lag values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [737]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged-target
# columns created above, so the target leaks into the PCA inputs — restrict the
# fit to the predictor columns only and refit.
# NOTE(review): PCA is fit on unscaled data, so large-magnitude columns
# (e.g. 'cardiovasc_death_rate', hundreds) will dominate; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[737]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [738]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# NOTE(review): these are the first 6 principal components — linear combinations
# of ALL columns passed to pca.fit (including the lagged target) — not the 6
# original input variables themselves.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [739]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns below are principal components but are labelled with
# the original feature names; downstream feature importances will therefore be
# attributed to the wrong variables — name them PC1..PC6 instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Carry the country label alongside the components (relies on row order matching df_updated).
principal_df['location'] = df_updated['location'].values
In [740]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not used by the model
# below (X is built from principal_df), so this step appears removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [741]:
# NOTE(review): these names refer to PCA components, not the original variables (see labelling above).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on a daily time series with lagged-target
# features lets near-duplicate neighbouring days span train and test, inflating
# CV/test scores — consider a chronological split (e.g. TimeSeriesSplit).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [742]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics cannot leak into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
Out[742]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [743]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # z-scores using the train-set mean/std fitted above
In [744]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # uses the train-set statistics, as it should
In [745]:
# Base regressor; the grid search in the next cell tunes its key hyperparameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: forest size, tree-depth cap, and the two node-size controls.
# (key order is cosmetic — GridSearchCV enumerates the same 81 candidates either way)
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200],
}
In [746]:
# perform grid search and 10-fold cross-validation (k = 10)
# (default scoring for a regressor is R^2; with the default refit=True the best model is
# refit on the whole training set and exposed as grid_search.best_estimator_)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.99396874660859
In [747]:
# fit random forest model with best hyperparameters from above
# **grid_search.best_params_ unpacks the same four tuned settings (n_estimators,
# max_depth, min_samples_split, min_samples_leaf) the original listed one by one;
# random_state=42 keeps the refit reproducible.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Out-of-sample predictions on the scaled test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [748]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability distributions
# (both are normalized to sum to 1) and returns their KL divergence — it is not a
# standard regression error metric; confirm this is the intended quantity here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.09053132731245286
R2 Score: 0.9905744145265692
RMSE: 0.300884
Entropy Value: 0.005811101609293169
In [749]:
# Tabulate and rank the model's importance scores for the six PCA inputs
# (the 'feature' labels are the selected_cols component labels).
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[749]:
feature importance
1 diabetes_prevalence 0.805098
0 cardiovasc_death_rate 0.155346
2 female_smokers 0.018175
5 median_age 0.016554
3 life_expectancy 0.003922
4 aged_65_older 0.000905
In [750]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable data directory would
# make the notebook runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[750]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [751]:
country1 = 'Estonia'
country2 = 'France'

# Restrict to the country-health-index features (plus identifiers and target) for the
# two countries compared in this section.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [752]:
# Inspect the filtered two-country frame.
df_updated
Out[752]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6250 Estonia 1/18/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6251 Estonia 2/5/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6252 Estonia 2/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6253 Estonia 2/7/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411710
9443 France 12/26/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411282
9444 France 12/27/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411730
9445 France 12/28/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411813
9446 France 12/29/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411892

2132 rows × 9 columns

In [753]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lags of the mortality-rate series: 1 day, 1 week (7 days), 1 month (30 days).
# Grouping by location keeps a shift from bleeding across the country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [754]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the shift() lags leave NaNs at the start of each country's series; treat the
# missing history as zero mortality)
for lag_col in ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [755]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' (the target) and its lagged copies,
# so the components are partly built from the target itself — confirm this is intended.
# NOTE(review): the features are not standardized before PCA, so large-magnitude columns
# (e.g. population) will dominate the components — verify before interpreting them.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[755]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [756]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project the same columns the PCA was fitted on, then keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [757]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels reuse the raw feature names, but each column is a principal
# component (a mixture of all inputs), not the named feature itself — the labels are
# misleading; 'PC1'..'PC6' would be clearer. Left unchanged because later cells select
# columns by these exact names.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [758]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (only 'location' is encoded; every other column is left untouched)
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [759]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X comes from the PCA frame, y from the original frame; rows align because principal_df
# was built from df_updated in order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of daily time-series rows places near-identical neighbouring
# days in both train and test, which inflates the scores reported below — a time-based
# split would give a fairer estimate; confirm this is acceptable for the analysis.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [760]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics do not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[760]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [761]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # z-scores using the train-set mean/std fitted above
In [762]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # uses the train-set statistics, as it should
In [763]:
# Base regressor; the grid search in the next cell tunes its key hyperparameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: forest size, tree-depth cap, and the two node-size controls.
# (key order is cosmetic — GridSearchCV enumerates the same 81 candidates either way)
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200],
}
In [764]:
# perform grid search and 10-fold cross-validation (k = 10)
# (default scoring for a regressor is R^2; with the default refit=True the best model is
# refit on the whole training set and exposed as grid_search.best_estimator_)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9924204242316919
In [765]:
# fit random forest model with best hyperparameters from above
# **grid_search.best_params_ unpacks the same four tuned settings (n_estimators,
# max_depth, min_samples_split, min_samples_leaf) the original listed one by one;
# random_state=42 keeps the refit reproducible.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Out-of-sample predictions on the scaled test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [766]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability distributions
# (both are normalized to sum to 1) and returns their KL divergence — it is not a
# standard regression error metric; confirm this is the intended quantity here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.09564363505394957
R2 Score: 0.9900421513308949
RMSE: 0.309263
Entropy Value: 0.005800126883421082
In [767]:
# Tabulate and rank the model's importance scores for the six PCA inputs
# (the 'feature' labels are the selected_cols component labels).
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[767]:
feature importance
1 human_development_index 0.929469
5 population 0.033558
2 extreme_poverty 0.029972
3 gdp_per_capita 0.003907
0 hospital_beds_per_thousand 0.002285
4 population_density 0.000808
In [768]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable data directory would
# make the notebook runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[768]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [769]:
country1 = 'Italy'
country2 = 'Portugal'

# Restrict to the population-health-index features (plus identifiers and target) for the
# two countries compared in this section.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
In [770]:
# Inspect the filtered two-country frame.
df_updated
Out[770]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 83.51 23.021 47.9 0.735109

2098 rows × 9 columns

In [771]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lags of the mortality-rate series: 1 day, 1 week (7 days), 1 month (30 days).
# Grouping by location keeps a shift from bleeding across the country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [772]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the shift() lags leave NaNs at the start of each country's series; treat the
# missing history as zero mortality)
for lag_col in ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [773]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' (the target) and its lagged copies,
# so the components are partly built from the target itself — confirm this is intended.
# NOTE(review): the features are not standardized before PCA, so large-magnitude columns
# will dominate the components — verify before interpreting them.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[773]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [774]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project the same columns the PCA was fitted on, then keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [775]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels reuse the raw feature names, but each column is a principal
# component (a mixture of all inputs), not the named feature itself — the labels are
# misleading; 'PC1'..'PC6' would be clearer. Left unchanged because later cells select
# columns by these exact names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [776]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (only 'location' is encoded; every other column is left untouched)
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [777]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X comes from the PCA frame, y from the original frame; rows align because principal_df
# was built from df_updated in order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of daily time-series rows places near-identical neighbouring
# days in both train and test, which inflates the scores reported below — a time-based
# split would give a fairer estimate; confirm this is acceptable for the analysis.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [778]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics do not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[778]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [779]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # z-scores using the train-set mean/std fitted above
In [780]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # uses the train-set statistics, as it should
In [781]:
# Base regressor; the grid search in the next cell tunes its key hyperparameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: forest size, tree-depth cap, and the two node-size controls.
# (key order is cosmetic — GridSearchCV enumerates the same 81 candidates either way)
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200],
}
In [782]:
# perform grid search and 10-fold cross-validation (k = 10)
# (default scoring for a regressor is R^2; with the default refit=True the best model is
# refit on the whole training set and exposed as grid_search.best_estimator_)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9992491538823527
In [783]:
# fit random forest model with best hyperparameters from above
# **grid_search.best_params_ unpacks the same four tuned settings (n_estimators,
# max_depth, min_samples_split, min_samples_leaf) the original listed one by one;
# random_state=42 keeps the refit reproducible.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Out-of-sample predictions on the scaled test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [784]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability distributions
# (both are normalized to sum to 1) and returns their KL divergence — it is not a
# standard regression error metric; confirm this is the intended quantity here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.03916584085137451
R2 Score: 0.9965620865525731
RMSE: 0.197904
Entropy Value: 0.0019471953731532649
In [785]:
# Tabulate and rank the model's importance scores for the six PCA inputs
# (the 'feature' labels are the selected_cols component labels).
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[785]:
feature importance
0 cardiovasc_death_rate 0.731503
5 median_age 0.189648
1 diabetes_prevalence 0.051121
2 female_smokers 0.025325
3 life_expectancy 0.002138
4 aged_65_older 0.000265
In [786]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable data directory would
# make the notebook runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[786]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [787]:
country1 = 'Italy'
country2 = 'Portugal'

# Restrict to the country-health-index features (plus identifiers and target) for the
# two countries compared in this section.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [788]:
# Inspect the filtered two-country frame.
df_updated
Out[788]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2098 rows × 9 columns

In [789]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lags of the mortality-rate series: 1 day, 1 week (7 days), 1 month (30 days).
# Grouping by location keeps a shift from bleeding across the country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [790]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the shift() lags leave NaNs at the start of each country's series; treat the
# missing history as zero mortality)
for lag_col in ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [791]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' (the target) and its lagged copies,
# so the components are partly built from the target itself — confirm this is intended.
# NOTE(review): the features are not standardized before PCA, so large-magnitude columns
# (e.g. population) will dominate the components — verify before interpreting them.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[791]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [792]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project the same columns the PCA was fitted on, then keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [793]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels reuse the raw feature names, but each column is a principal
# component (a mixture of all inputs), not the named feature itself — the labels are
# misleading; 'PC1'..'PC6' would be clearer. Left unchanged because later cells select
# columns by these exact names.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [794]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (only 'location' is encoded; every other column is left untouched)
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [795]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X comes from the PCA frame, y from the original frame; rows align because principal_df
# was built from df_updated in order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of daily time-series rows places near-identical neighbouring
# days in both train and test, which inflates the scores reported below — a time-based
# split would give a fairer estimate; confirm this is acceptable for the analysis.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [796]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics do not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[796]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [797]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # z-scores using the train-set mean/std fitted above
In [798]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # uses the train-set statistics, as it should
In [799]:
# Base regressor; the grid search in the next cell tunes its key hyperparameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: forest size, tree-depth cap, and the two node-size controls.
# (key order is cosmetic — GridSearchCV enumerates the same 81 candidates either way)
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200],
}
In [800]:
# perform grid search and 10-fold cross-validation (k = 10)
# (default scoring for a regressor is R^2; with the default refit=True the best model is
# refit on the whole training set and exposed as grid_search.best_estimator_)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9991962488532368
In [801]:
# fit random forest model with best hyperparameters from above
# **grid_search.best_params_ unpacks the same four tuned settings (n_estimators,
# max_depth, min_samples_split, min_samples_leaf) the original listed one by one;
# random_state=42 keeps the refit reproducible.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Out-of-sample predictions on the scaled test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [802]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability distributions
# (both are normalized to sum to 1) and returns their KL divergence — it is not a
# standard regression error metric; confirm this is the intended quantity here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0346698323414571
R2 Score: 0.996956738825574
RMSE: 0.186198
Entropy Value: 0.0012806717112898633
In [803]:
# Tabulate and rank the model's importance scores for the six PCA inputs
# (the 'feature' labels are the selected_cols component labels).
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[803]:
feature importance
1 human_development_index 0.923655
5 population 0.047005
2 extreme_poverty 0.026004
3 gdp_per_capita 0.003020
4 population_density 0.000296
0 hospital_beds_per_thousand 0.000020
In [804]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable data directory would
# make the notebook runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[804]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [805]:
country1 = 'Romania'
country2 = 'Serbia'

# Restrict the frame to the population-health features (plus identifiers and
# the target) and to the two countries being compared in this run.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [806]:
df_updated
Out[806]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 10.08 37.7 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 439.415 10.08 37.7 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 439.415 10.08 37.7 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 439.415 10.08 37.7 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 439.415 10.08 37.7 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 76.05 17.850 43.0 2.036403

2076 rows × 9 columns

In [807]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are date-sorted within each location —
# verify the CSV ordering. These lag columns are later swept into PCA via
# iloc[:, 2:], leaking the target into the model inputs (see PCA cell below).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [808]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (lags are undefined for each country's first day/week/month of data)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [809]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled data, on ALL rows before the train/test
# split, and iloc[:, 2:] includes 'Mortality Rate' and its lag columns — the
# target leaks into the inputs, which likely inflates the R^2 reported below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[809]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [810]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [811]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels are misleading — each column is a principal
# component (a linear mix of every input, including the target's lags), not
# the named raw feature. Names like 'PC1'..'PC6' would prevent misreading the
# feature-importance table later.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [812]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummies are never used afterwards
# (X is built from principal_df), so this step is inert here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [813]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split on a daily time series places near-identical
# neighbouring days in both train and test; prefer a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [814]:
# Fit scaling on the training set
# (mean/std learned from the training split only — no test leakage at this step)
scaler = StandardScaler()
scaler.fit(X_train)
Out[814]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [815]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [816]:
# Apply scaling on the test set (using the train-fitted mean/std)
X_test_scaled = scaler.transform(X_test)
In [817]:
# Instantiate the RandomForestRegressor Model
# (these initial hyperparameters are placeholders; GridSearchCV below tunes them)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [818]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81x10 fits across all CPU cores; results are
# unchanged because every forest keeps random_state=42.
# NOTE(review): plain KFold shuffles a time series; TimeSeriesSplit would give
# a more honest CV estimate here.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9963303624947721
In [819]:
# fit random forest model with best hyperparameters from above
# Unpacking best_params_ forwards every tuned hyperparameter verbatim, so this
# cell cannot drift out of sync if the parameter grid is later extended.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out test set, evaluated in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [820]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence between its inputs
# treated as probability distributions — not a regression metric; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002100050408732467
R2 Score: 0.9987773329821902
RMSE: 0.045826
Entropy Value: 0.0002089676106423466
In [821]:
# Rank the model inputs by impurity-based importance, largest first.
# NOTE(review): the inputs are principal components that were merely labelled
# with original feature names, so these scores describe PCs, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[821]:
feature importance
0 cardiovasc_death_rate 0.598715
5 median_age 0.242721
1 diabetes_prevalence 0.134221
2 female_smokers 0.020674
3 life_expectancy 0.003062
4 aged_65_older 0.000607
In [822]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[822]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [823]:
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for the Random Forest Model Analysis for the country health index
# NOTE(review): this pipeline is copy-pasted per country pair and index type; a
# single parameterised function (countries, feature_cols) would remove drift risk.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [824]:
df_updated
Out[824]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403

2076 rows × 9 columns

In [825]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes date-sorted rows per location; these lag columns
# are later included in PCA via iloc[:, 2:], leaking the target into the inputs.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [826]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (lags are undefined for each country's first day/week/month of data)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [827]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fit on unscaled data, on all rows pre-split, and iloc[:, 2:]
# includes 'Mortality Rate' plus its lags — target leakage into the inputs.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[827]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [828]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [829]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — each column is a principal component, not
# the named raw feature; prefer 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [830]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used afterwards — inert step.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [831]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split on a daily time series — prefer a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [832]:
# Fit scaling on the training set
# (mean/std learned from the training split only — no test leakage at this step)
scaler = StandardScaler()
scaler.fit(X_train)
Out[832]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [833]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [834]:
# Apply scaling on the test set (using the train-fitted mean/std)
X_test_scaled = scaler.transform(X_test)
In [835]:
# Instantiate the RandomForestRegressor Model
# (these initial hyperparameters are placeholders; GridSearchCV below tunes them)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [836]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the fits across all CPU cores; results are unchanged
# because every forest keeps random_state=42.
# NOTE(review): plain KFold shuffles a time series; consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9942151184753699
In [837]:
# fit random forest model with best hyperparameters from above
# Unpacking best_params_ forwards every tuned hyperparameter verbatim, so this
# cell cannot drift out of sync if the parameter grid is later extended.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out test set, evaluated in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [838]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence between its inputs
# treated as probability distributions — not a regression metric; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003107698404524249
R2 Score: 0.9981906718406797
RMSE: 0.055747
Entropy Value: 0.0004506715934095217
In [839]:
# Rank the model inputs by impurity-based importance, largest first.
# NOTE(review): the inputs are principal components that were merely labelled
# with original feature names, so these scores describe PCs, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[839]:
feature importance
5 population 0.753233
1 human_development_index 0.216832
2 extreme_poverty 0.021982
3 gdp_per_capita 0.007136
4 population_density 0.000814
0 hospital_beds_per_thousand 0.000003
In [840]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[840]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [841]:
country1 = 'Slovakia'
country2 = 'Spain'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): copy-pasted pipeline per country pair; a parameterised function
# (countries, feature_cols) would remove drift risk.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [842]:
df_updated
Out[842]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 83.56 19.436 45.5 0.855148

2092 rows × 9 columns

In [843]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes date-sorted rows per location; these lag columns
# are later included in PCA via iloc[:, 2:], leaking the target into the inputs.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [844]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (lags are undefined for each country's first day/week/month of data)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [845]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fit on unscaled data, on all rows pre-split, and iloc[:, 2:]
# includes 'Mortality Rate' plus its lags — target leakage into the inputs.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[845]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [846]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [847]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — each column is a principal component, not
# the named raw feature; prefer 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [848]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used afterwards — inert step.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [849]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split on a daily time series — prefer a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [850]:
# Fit scaling on the training set
# (mean/std learned from the training split only — no test leakage at this step)
scaler = StandardScaler()
scaler.fit(X_train)
Out[850]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [851]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [852]:
# Apply scaling on the test set (using the train-fitted mean/std)
X_test_scaled = scaler.transform(X_test)
In [853]:
# Instantiate the RandomForestRegressor Model
# (these initial hyperparameters are placeholders; GridSearchCV below tunes them)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [854]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the fits across all CPU cores; results are unchanged
# because every forest keeps random_state=42.
# NOTE(review): plain KFold shuffles a time series; consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9993652655177874
In [855]:
# fit random forest model with best hyperparameters from above
# Unpacking best_params_ forwards every tuned hyperparameter verbatim, so this
# cell cannot drift out of sync if the parameter grid is later extended.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out test set, evaluated in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [856]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence between its inputs
# treated as probability distributions — not a regression metric; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002584137818108158
R2 Score: 0.9995264678213833
RMSE: 0.050834
Entropy Value: 0.0002068980142316218
In [857]:
# Rank the model inputs by impurity-based importance, largest first.
# NOTE(review): the inputs are principal components that were merely labelled
# with original feature names, so these scores describe PCs, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[857]:
feature importance
5 median_age 0.982145
1 diabetes_prevalence 0.009200
2 female_smokers 0.004551
0 cardiovasc_death_rate 0.003429
3 life_expectancy 0.000589
4 aged_65_older 0.000087
In [858]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[858]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [859]:
country1 = 'Slovakia'
country2 = 'Spain'

# Extracting important features for the Random Forest Model Analysis for the country health index
# NOTE(review): copy-pasted pipeline per country pair; a parameterised function
# (countries, feature_cols) would remove drift risk.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [860]:
df_updated
Out[860]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148

2092 rows × 9 columns

In [861]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes date-sorted rows per location; these lag columns
# are later included in PCA via iloc[:, 2:], leaking the target into the inputs.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [862]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (lags are undefined for each country's first day/week/month of data)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [863]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] keeps every column after location/date, which
# includes 'Mortality Rate' and its lagged copies — the target leaks into the
# fitted components; consider fitting PCA on the predictor columns only.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns (e.g.
# population) will dominate the components; standardizing before PCA is the
# usual practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[863]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [864]:
# Keep the first 6 principal components — chosen to equal the number of input
# variables in this Random Forest analysis (country health index).
# NOTE(review): component count is usually chosen from pca.explained_variance_ratio_
# rather than from the number of raw inputs — confirm 6 PCs retain enough variance.
n_components = 6  # number of retained principal components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [865]:
# Wrap the principal-component scores in a DataFrame, carrying location along
# for reference (rows stay positionally aligned with df_updated).
# NOTE(review): these columns are PCA components, not the original variables —
# relabelling PC1..PC6 with the original feature names is misleading, and any
# downstream "feature importance" actually describes the components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [866]:
# Convert the categorical 'location' column to numeric indicator columns using one-hot encoding (get_dummies()).
# NOTE(review): the resulting location_* dummy columns do not appear to be used
# by the following cells (X is built from principal_df) — confirm this step is needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [867]:
# X: principal-component scores (positionally row-aligned with df_updated,
# so y matches X row-for-row); y: the raw mortality-rate target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for the Random Forest model.
# NOTE(review): a random split of time-series rows lets future observations
# appear in training while earlier ones are tested; a chronological split
# would be a stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [868]:
# Fit the scaler on the training set only (avoids test-set leakage into the
# scaling statistics).
# NOTE(review): scaling is applied after PCA here; the conventional order is to
# standardize the raw features first and then fit PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[868]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [869]:
# Apply scaling on the training set (uses the mean/std learned from X_train above).
X_train_scaled = scaler.transform(X_train)
In [870]:
# Apply scaling on the test set (reuses the training-set statistics; the
# scaler is never refit on test data).
X_test_scaled = scaler.transform(X_test)
In [871]:
# Instantiate the RandomForestRegressor base model.
# n_jobs=-1 builds the trees on all CPU cores; with a fixed random_state the
# fitted model is identical to the single-core version, just faster.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Hyperparameter grid explored by GridSearchCV below.
param_grid = {
    'n_estimators': [50, 100, 200],    # number of trees
    'max_depth': [5, 10, 15],          # maximum tree depth
    'min_samples_split': [2, 5, 10],   # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples required at a leaf
}
In [872]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 evaluates candidate parameter settings in parallel; results are
# unchanged because every fit is seeded by the estimator's random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the corresponding mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975963885456272
In [873]:
# GridSearchCV (refit=True by default) has already refit the best parameter
# combination on the full training set with the same random_state, so reuse
# that estimator instead of rebuilding and retraining an identical model.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [874]:
# Evaluate the Random Forest model: Mean Squared Error (MSE), Root Mean
# Squared Error (RMSE), R^2 score, and an entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality rate
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two inputs as probability
# distributions (it normalizes them and computes KL divergence), which is not
# a standard regression error metric — interpret this number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008134240357790014
R2 Score: 0.9985094353207385
RMSE: 0.090190
Entropy Value: 0.0006655143831412369
In [875]:
# Rank model inputs by impurity-based feature importance (values sum to 1).
# NOTE(review): X holds principal-component scores, so these rows rank the PCA
# components that were relabelled with the original feature names — not the
# raw variables themselves.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[875]:
feature importance
1 human_development_index 8.879057e-01
5 population 6.994735e-02
2 extreme_poverty 4.072583e-02
3 gdp_per_capita 1.082704e-03
4 population_density 3.382074e-04
0 hospital_beds_per_thousand 1.596907e-07
In [876]:
# Re-import the dataframe of first countries from each country pairing
# (resets df_updated, which the previous pipeline mutated).
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[876]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [877]:
# The two countries compared in this population-health analysis.
country1 = 'Switzerland'
country2 = 'Bulgaria'

# Keep identifiers, the population-health predictors and the target, then
# restrict the rows to the selected country pair.
population_health_cols = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
    'Mortality Rate',
]
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, population_health_cols]
In [878]:
# Preview the filtered two-country dataset before feature engineering.
df_updated
Out[878]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 99.739 5.59 22.6 83.78 18.436 43.1 0.322922
14645 Switzerland 12/26/2022 99.739 5.59 22.6 83.78 18.436 43.1 0.322922
14646 Switzerland 12/27/2022 99.739 5.59 22.6 83.78 18.436 43.1 0.322922
14647 Switzerland 12/28/2022 99.739 5.59 22.6 83.78 18.436 43.1 0.323082
14648 Switzerland 12/29/2022 99.739 5.59 22.6 83.78 18.436 43.1 0.322149

2066 rows × 9 columns

In [879]:
# Convert the time series to a supervised-learning table: a Random Forest is a
# non-sequential learner, so each row must carry its own history. We add
# per-country lagged mortality-rate features (previous day / week / month)
# with pandas shift() so the model can use recent mortality as a predictor.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [880]:
# The lag columns are NaN for each country's first day/week/month (no earlier
# observations exist); treat those missing lags as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [881]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] keeps every column after location/date, which
# includes 'Mortality Rate' and its lagged copies — the target leaks into the
# fitted components; consider fitting PCA on the predictor columns only.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns will
# dominate the components; standardizing before PCA is the usual practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[881]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [882]:
# Keep the first 6 principal components — chosen to equal the number of input
# variables in this Random Forest analysis (population health index).
# NOTE(review): component count is usually chosen from pca.explained_variance_ratio_
# rather than from the number of raw inputs — confirm 6 PCs retain enough variance.
n_components = 6  # number of retained principal components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [883]:
# Wrap the principal-component scores in a DataFrame, carrying location along
# for reference (rows stay positionally aligned with df_updated).
# NOTE(review): these columns are PCA components, not the original variables —
# relabelling PC1..PC6 with the original feature names is misleading, and any
# downstream "feature importance" actually describes the components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [884]:
# Convert the categorical 'location' column to numeric indicator columns using one-hot encoding (get_dummies()).
# NOTE(review): the resulting location_* dummy columns do not appear to be used
# by the following cells (X is built from principal_df) — confirm this step is needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [885]:
# X: principal-component scores (positionally row-aligned with df_updated,
# so y matches X row-for-row); y: the raw mortality-rate target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for the Random Forest model.
# NOTE(review): a random split of time-series rows lets future observations
# appear in training while earlier ones are tested; a chronological split
# would be a stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [886]:
# Fit the scaler on the training set only (avoids test-set leakage into the
# scaling statistics).
# NOTE(review): scaling is applied after PCA here; the conventional order is to
# standardize the raw features first and then fit PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[886]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [887]:
# Apply scaling on the training set (uses the mean/std learned from X_train above).
X_train_scaled = scaler.transform(X_train)
In [888]:
# Apply scaling on the test set (reuses the training-set statistics; the
# scaler is never refit on test data).
X_test_scaled = scaler.transform(X_test)
In [889]:
# Instantiate the RandomForestRegressor base model.
# n_jobs=-1 builds the trees on all CPU cores; with a fixed random_state the
# fitted model is identical to the single-core version, just faster.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Hyperparameter grid explored by GridSearchCV below.
param_grid = {
    'n_estimators': [50, 100, 200],    # number of trees
    'max_depth': [5, 10, 15],          # maximum tree depth
    'min_samples_split': [2, 5, 10],   # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples required at a leaf
}
In [890]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 evaluates candidate parameter settings in parallel; results are
# unchanged because every fit is seeded by the estimator's random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the corresponding mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9590497939781765
In [891]:
# GridSearchCV (refit=True by default) has already refit the best parameter
# combination on the full training set with the same random_state, so reuse
# that estimator instead of rebuilding and retraining an identical model.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [892]:
# Evaluate the Random Forest model: Mean Squared Error (MSE), Root Mean
# Squared Error (RMSE), R^2 score, and an entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality rate
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two inputs as probability
# distributions (it normalizes them and computes KL divergence), which is not
# a standard regression error metric — interpret this number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004896549532936876
R2 Score: 0.9980297240563568
RMSE: 0.069975
Entropy Value: 0.0006950015035866526
In [893]:
# Rank model inputs by impurity-based feature importance (values sum to 1).
# NOTE(review): X holds principal-component scores, so these rows rank the PCA
# components that were relabelled with the original feature names — not the
# raw variables themselves.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[893]:
feature importance
0 cardiovasc_death_rate 0.776305
1 diabetes_prevalence 0.101123
5 median_age 0.059598
2 female_smokers 0.031843
3 life_expectancy 0.017355
4 aged_65_older 0.013777
In [894]:
# Re-import the dataframe of first countries from each country pairing
# (resets df_updated, which the previous pipeline mutated).
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[894]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [895]:
# The two countries compared in this country-health (infrastructure) analysis.
country1 = 'Switzerland'
country2 = 'Bulgaria'

# Keep identifiers, the country-health predictors and the target, then
# restrict the rows to the selected country pair.
country_health_cols = [
    'location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
    'Mortality Rate',
]
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, country_health_cols]
In [896]:
# Preview the filtered two-country dataset before feature engineering.
df_updated
Out[896]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.50 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.50 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 4.530 0.955 0.03 57410.166 214.243 8740471 0.322922
14645 Switzerland 12/26/2022 4.530 0.955 0.03 57410.166 214.243 8740471 0.322922
14646 Switzerland 12/27/2022 4.530 0.955 0.03 57410.166 214.243 8740471 0.322922
14647 Switzerland 12/28/2022 4.530 0.955 0.03 57410.166 214.243 8740471 0.323082
14648 Switzerland 12/29/2022 4.530 0.955 0.03 57410.166 214.243 8740471 0.322149

2066 rows × 9 columns

In [897]:
# Convert the time series to a supervised-learning table: a Random Forest is a
# non-sequential learner, so each row must carry its own history. We add
# per-country lagged mortality-rate features (previous day / week / month)
# with pandas shift() so the model can use recent mortality as a predictor.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [898]:
# The lag columns are NaN for each country's first day/week/month (no earlier
# observations exist); treat those missing lags as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [899]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] keeps every column after location/date, which
# includes 'Mortality Rate' and its lagged copies — the target leaks into the
# fitted components; consider fitting PCA on the predictor columns only.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns (e.g.
# population) will dominate the components; standardizing before PCA is the
# usual practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[899]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [900]:
# Keep the first 6 principal components — chosen to equal the number of input
# variables in this Random Forest analysis (country health index).
# NOTE(review): component count is usually chosen from pca.explained_variance_ratio_
# rather than from the number of raw inputs — confirm 6 PCs retain enough variance.
n_components = 6  # number of retained principal components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [901]:
# Wrap the principal-component scores in a DataFrame, carrying location along
# for reference (rows stay positionally aligned with df_updated).
# NOTE(review): these columns are PCA components, not the original variables —
# relabelling PC1..PC6 with the original feature names is misleading, and any
# downstream "feature importance" actually describes the components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [902]:
# Convert the categorical 'location' column to numeric indicator columns using one-hot encoding (get_dummies()).
# NOTE(review): the resulting location_* dummy columns do not appear to be used
# by the following cells (X is built from principal_df) — confirm this step is needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [903]:
# X: principal-component scores (positionally row-aligned with df_updated,
# so y matches X row-for-row); y: the raw mortality-rate target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for the Random Forest model.
# NOTE(review): a random split of time-series rows lets future observations
# appear in training while earlier ones are tested; a chronological split
# would be a stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [904]:
# Fit the scaler on the training set only (avoids test-set leakage into the
# scaling statistics).
# NOTE(review): scaling is applied after PCA here; the conventional order is to
# standardize the raw features first and then fit PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[904]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [905]:
# Apply scaling on the training set (uses the mean/std learned from X_train above).
X_train_scaled = scaler.transform(X_train)
In [906]:
# Apply scaling on the test set (reuses the training-set statistics; the
# scaler is never refit on test data).
X_test_scaled = scaler.transform(X_test)
In [907]:
# Instantiate the RandomForestRegressor base model.
# n_jobs=-1 builds the trees on all CPU cores; with a fixed random_state the
# fitted model is identical to the single-core version, just faster.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Hyperparameter grid explored by GridSearchCV below.
param_grid = {
    'n_estimators': [50, 100, 200],    # number of trees
    'max_depth': [5, 10, 15],          # maximum tree depth
    'min_samples_split': [2, 5, 10],   # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples required at a leaf
}
In [908]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 evaluates candidate parameter settings in parallel; results are
# unchanged because every fit is seeded by the estimator's random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the corresponding mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.956167269462329
In [909]:
# GridSearchCV (refit=True by default) has already refit the best parameter
# combination on the full training set with the same random_state, so reuse
# that estimator instead of rebuilding and retraining an identical model.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [910]:
# Evaluate the Random Forest model: Mean Squared Error (MSE), Root Mean
# Squared Error (RMSE), R^2 score, and an entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality rate
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two inputs as probability
# distributions (it normalizes them and computes KL divergence), which is not
# a standard regression error metric — interpret this number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005126255663843806
R2 Score: 0.9979372947935078
RMSE: 0.071598
Entropy Value: 0.0006230810700906027
In [911]:
# Rank model inputs by impurity-based feature importance (values sum to 1).
# NOTE(review): X holds principal-component scores, so these rows rank the PCA
# components that were relabelled with the original feature names — not the
# raw variables themselves.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[911]:
feature importance
1 human_development_index 0.595190
5 population 0.208472
0 hospital_beds_per_thousand 0.114103
2 extreme_poverty 0.043787
3 gdp_per_capita 0.025727
4 population_density 0.012720
In [912]:
# Re-import the dataframe of first countries from each country pairing
# (resets df_updated, which the previous pipeline mutated).
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[912]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [913]:
# The two countries compared in this population-health analysis.
country1 = 'Cyprus'
country2 = 'Latvia'

# Keep identifiers, the population-health predictors and the target, then
# restrict the rows to the selected country pair.
population_health_cols = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
    'Mortality Rate',
]
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, population_health_cols]
In [914]:
# Preview the filtered two-country dataset before feature engineering.
df_updated
Out[914]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 75.29 19.754 43.9 0.631969

2065 rows × 9 columns

In [915]:
# Convert the time series to a supervised-learning table: a Random Forest is a
# non-sequential learner, so each row must carry its own history. We add
# per-country lagged mortality-rate features (previous day / week / month)
# with pandas shift() so the model can use recent mortality as a predictor.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [916]:
# The lag columns are NaN for each country's first day/week/month (no earlier
# observations exist); treat those missing lags as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [917]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] keeps every column after location/date, which
# includes 'Mortality Rate' and its lagged copies — the target leaks into the
# fitted components; consider fitting PCA on the predictor columns only.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns will
# dominate the components; standardizing before PCA is the usual practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[917]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [918]:
# Keep the first 6 principal components — chosen to equal the number of input
# variables in this Random Forest analysis (population health index).
# NOTE(review): component count is usually chosen from pca.explained_variance_ratio_
# rather than from the number of raw inputs — confirm 6 PCs retain enough variance.
n_components = 6  # number of retained principal components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [919]:
# Wrap the principal-component scores in a DataFrame, carrying location along
# for reference (rows stay positionally aligned with df_updated).
# NOTE(review): these columns are PCA components, not the original variables —
# relabelling PC1..PC6 with the original feature names is misleading, and any
# downstream "feature importance" actually describes the components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [920]:
# Convert the categorical 'location' column to numeric indicator columns using one-hot encoding (get_dummies()).
# NOTE(review): the resulting location_* dummy columns do not appear to be used
# by the following cells (X is built from principal_df) — confirm this step is needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [921]:
# X: principal-component scores (positionally row-aligned with df_updated,
# so y matches X row-for-row); y: the raw mortality-rate target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for the Random Forest model.
# NOTE(review): a random split of time-series rows lets future observations
# appear in training while earlier ones are tested; a chronological split
# would be a stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [922]:
# Fit the scaler on the training set only (avoids test-set leakage into the
# scaling statistics).
# NOTE(review): scaling is applied after PCA here; the conventional order is to
# standardize the raw features first and then fit PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[922]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [923]:
# Apply scaling on the training set (uses the mean/std learned from X_train above).
X_train_scaled = scaler.transform(X_train)
In [924]:
# Apply scaling on the test set (reuses the training-set statistics; the
# scaler is never refit on test data).
X_test_scaled = scaler.transform(X_test)
In [925]:
# Instantiate the RandomForestRegressor base model.
# n_jobs=-1 builds the trees on all CPU cores; with a fixed random_state the
# fitted model is identical to the single-core version, just faster.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Hyperparameter grid explored by GridSearchCV below.
param_grid = {
    'n_estimators': [50, 100, 200],    # number of trees
    'max_depth': [5, 10, 15],          # maximum tree depth
    'min_samples_split': [2, 5, 10],   # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples required at a leaf
}
In [926]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 evaluates candidate parameter settings in parallel; results are
# unchanged because every fit is seeded by the estimator's random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the corresponding mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9920063450501019
In [927]:
# GridSearchCV (refit=True by default) has already refit the best parameter
# combination on the full training set with the same random_state, so reuse
# that estimator instead of rebuilding and retraining an identical model.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [928]:
# Evaluate the Random Forest model: Mean Squared Error (MSE), Root Mean
# Squared Error (RMSE), R^2 score, and an entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality rate
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two inputs as probability
# distributions (it normalizes them and computes KL divergence), which is not
# a standard regression error metric — interpret this number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0009323680911373003
R2 Score: 0.9981522797865945
RMSE: 0.030535
Entropy Value: 0.0003241275435360705
In [929]:
# Rank model inputs by the forest's impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on PCA-transformed columns, so each
# importance belongs to a principal component, not to the original feature
# whose name labels it here — verify before interpreting these as feature
# importances.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[929]:
feature importance
1 diabetes_prevalence 0.822942
0 cardiovasc_death_rate 0.117464
2 female_smokers 0.032022
5 median_age 0.022714
3 life_expectancy 0.002734
4 aged_65_older 0.002124
In [930]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a DATA_DIR Path constant
# in a config cell so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[930]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [931]:
country1 = 'Cyprus'
country2 = 'Latvia'

# Extracting important features for the Random Forest Model Analysis for the country health index.
# .copy() makes the filtered result an independent frame: later cells assign new
# lag columns to it, and writing into a slice view would raise
# SettingWithCopyWarning (and may not write through) in pandas.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [932]:
df_updated
Out[932]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631969

2065 rows × 9 columns

In [933]:
# A Random Forest is an ensemble learner for non-sequential, tabular data, so the
# time series must be recast as a supervised-learning problem: each row becomes a
# single observation whose features include the mortality rate observed 1 day,
# 7 days, and 30 days earlier. pandas' shift(), applied within each country,
# builds those lagged columns.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [934]:
# The first 1/7/30 rows of each country have no lagged value after shift();
# treat "no history" as a mortality rate of 0 so those rows remain usable.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [935]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the matrix fed
# to PCA still contains 'Mortality Rate' and its three lag columns — the target
# leaks into the components that later become model inputs, likely inflating the
# reported R^2. Consider fitting PCA on the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[935]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [936]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# NOTE(review): the matrix passed to transform (iloc[:, 2:]) includes
# 'Mortality Rate' and its lag columns, so the target contributes to these
# components — TODO confirm and restrict to predictor columns only.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [937]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the named
# original features — the labels only borrow the feature names, which can
# mislead downstream interpretation (e.g. feature importances).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values  # components preserve df_updated's row order
In [938]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and y from 'Mortality Rate' — so this step appears to be dead
# code; confirm before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [939]:
# Model inputs: the six PCA components (labelled with the original feature names)
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values  # rows of principal_df and df_updated are aligned by construction

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this is a daily time series; a shuffled random split puts
# near-duplicate adjacent days in both train and test, which can inflate test
# R^2. Consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [940]:
# Fit scaling on the training set only, so test-set statistics never leak into
# the scaler; the test set is later transformed with these train-set parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[940]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [941]:
# Apply scaling on the training set using the parameters fitted on the training data
X_train_scaled = scaler.transform(X_train)
In [942]:
# Apply scaling on the test set with the train-fitted scaler (never re-fit here)
X_test_scaled = scaler.transform(X_test)
In [943]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder — every grid candidate below sets it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate settings
param_grid = {
    'n_estimators': [50, 100, 200],   # number of trees in the forest
    'max_depth': [5, 10, 15],         # cap on individual tree depth
    'min_samples_split': [2, 5, 10],  # min samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # min samples required at a leaf
}
In [944]:
# perform grid search and 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidates x 10 folds in parallel on all cores;
# it does not change the selected model or its scores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (GridSearchCV scores a regressor with R^2 by default)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9928418947763469
In [945]:
# fit random forest model with best hyperparameters from above.
# **best_params_ forwards all tuned values without repeating each key by hand.
# (GridSearchCV already refits — grid_search.best_estimator_ is this same model —
# but the explicit fit keeps the final estimator obvious to readers.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predict on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
In [946]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target (mortality rate)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1
# and returns their KL divergence — it treats y_test/y_pred as probability
# distributions, not regression targets, and returns inf if any y_pred element
# is 0 where y_test is not. Confirm this metric is intended here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0007558023623301819
R2 Score: 0.9985021888720862
RMSE: 0.027492
Entropy Value: 0.00025084148640829615
In [947]:
# Rank model inputs by the forest's impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on PCA-transformed columns, so each
# importance belongs to a principal component, not to the original feature
# whose name labels it here — verify before interpreting these as feature
# importances.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[947]:
feature importance
1 human_development_index 0.828245
5 population 0.132583
2 extreme_poverty 0.033619
3 gdp_per_capita 0.003182
4 population_density 0.002366
0 hospital_beds_per_thousand 0.000005
In [229]:
# Country Pair by Pair Analysis relative to female smokers
In [230]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute local path — prefer a DATA_DIR Path constant
# in a config cell so the notebook runs on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[230]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [231]:
# Showing the pairings of countries based on female smokers (13 pairs of countries).
# A small helper keeps the 26 per-country selections identical in form.
def _rows_for(country):
    """Return the subset of df whose location equals `country`."""
    return df[df.location == country]

df_Canada = _rows_for("Canada")
df_Cyprus = _rows_for("Cyprus")

df_Denmark = _rows_for("Denmark")
df_Finland = _rows_for("Finland")

df_Iceland = _rows_for("Iceland")
df_Italy = _rows_for("Italy")

df_Portugal = _rows_for("Portugal")
df_Slovenia = _rows_for("Slovenia")

df_Sweden = _rows_for("Sweden")
df_UnitedKingdom = _rows_for("United Kingdom")

df_UnitedStates = _rows_for("United States")
df_Austria = _rows_for("Austria")

df_Belgium = _rows_for("Belgium")
df_Estonia = _rows_for("Estonia")

df_Ireland = _rows_for("Ireland")
df_Latvia = _rows_for("Latvia")

df_Luxembourg = _rows_for("Luxembourg")
df_Netherlands = _rows_for("Netherlands")

df_Romania = _rows_for("Romania")
df_Slovakia = _rows_for("Slovakia")

df_Spain = _rows_for("Spain")
df_Switzerland = _rows_for("Switzerland")

df_Bulgaria = _rows_for("Bulgaria")
df_Czechia = _rows_for("Czechia")

df_France = _rows_for("France")
df_Serbia = _rows_for("Serbia")
In [232]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [233]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file.
# index=False keeps the per-country row index out of the file; without it a
# spurious 'Unnamed: 0' column appears when the CSV is read back in.
dataframe_one.to_csv("dataframe-one.csv", index=False)
In [234]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path (and the previous cell wrote
# "dataframe-one.csv" to the working directory, not to Downloads) — confirm the
# two files are the same; prefer one configurable DATA_DIR path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[234]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [235]:
country1 = 'Canada'
country2 = 'Cyprus'

# Extracting important features for Random Forest Model Analysis for the population health index.
# .copy() makes the filtered result an independent frame: later cells assign new
# lag columns to it, and writing into a slice view would raise
# SettingWithCopyWarning (and may not write through) in pandas.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [236]:
df_updated
Out[236]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 16.6 82.43 16.984 41.4 1.093162

2099 rows × 9 columns

In [237]:
# A Random Forest is an ensemble learner for non-sequential, tabular data, so the
# time series must be recast as a supervised-learning problem: each row becomes a
# single observation whose features include the mortality rate observed 1 day,
# 7 days, and 30 days earlier. pandas' shift(), applied within each country,
# builds those lagged columns.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [238]:
# The first 1/7/30 rows of each country have no lagged value after shift();
# treat "no history" as a mortality rate of 0 so those rows remain usable.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [239]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the matrix fed
# to PCA still contains 'Mortality Rate' and its three lag columns — the target
# leaks into the components that later become model inputs, likely inflating the
# reported R^2. Consider fitting PCA on the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[239]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [240]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# NOTE(review): the matrix passed to transform (iloc[:, 2:]) includes
# 'Mortality Rate' and its lag columns, so the target contributes to these
# components — TODO confirm and restrict to predictor columns only.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [241]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the named
# original features — the labels only borrow the feature names, which can
# mislead downstream interpretation (e.g. feature importances).
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values  # components preserve df_updated's row order
In [242]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and y from 'Mortality Rate' — so this step appears to be dead
# code; confirm before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [243]:
# Model inputs: the six PCA components (labelled with the original feature names)
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values  # rows of principal_df and df_updated are aligned by construction

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this is a daily time series; a shuffled random split puts
# near-duplicate adjacent days in both train and test, which can inflate test
# R^2. Consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [244]:
# Fit scaling on the training set only, so test-set statistics never leak into
# the scaler; the test set is later transformed with these train-set parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[244]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [245]:
# Apply scaling on the training set using the parameters fitted on the training data
X_train_scaled = scaler.transform(X_train)
In [246]:
# Apply scaling on the test set with the train-fitted scaler (never re-fit here)
X_test_scaled = scaler.transform(X_test)
In [247]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder — every grid candidate below sets it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate settings
param_grid = {
    'n_estimators': [50, 100, 200],   # number of trees in the forest
    'max_depth': [5, 10, 15],         # cap on individual tree depth
    'min_samples_split': [2, 5, 10],  # min samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # min samples required at a leaf
}
In [248]:
# perform grid search and 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidates x 10 folds in parallel on all cores;
# it does not change the selected model or its scores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (GridSearchCV scores a regressor with R^2 by default)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9982903099276361
In [249]:
# fit random forest model with best hyperparameters from above.
# **best_params_ forwards all tuned values without repeating each key by hand.
# (GridSearchCV already refits — grid_search.best_estimator_ is this same model —
# but the explicit fit keeps the final estimator obvious to readers.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predict on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
In [250]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target (mortality rate)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1
# and returns their KL divergence — it treats y_test/y_pred as probability
# distributions, not regression targets, and returns inf if any y_pred element
# is 0 where y_test is not. Confirm this metric is intended here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002818832110675199
R2 Score: 0.9991712781195069
RMSE: 0.053093
Entropy Value: 0.00033073646040135244
In [251]:
# Rank model inputs by the forest's impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on PCA-transformed columns, so each
# importance belongs to a principal component, not to the original feature
# whose name labels it here — verify before interpreting these as feature
# importances.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[251]:
feature importance
0 cardiovasc_death_rate 0.802712
1 diabetes_prevalence 0.121281
2 male_smokers 0.037316
5 median_age 0.034987
3 life_expectancy 0.003115
4 aged_65_older 0.000589
In [252]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a DATA_DIR Path constant
# in a config cell so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[252]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [253]:
country1 = 'Canada'
country2 = 'Cyprus'

# Extracting important features for the Random Forest Model Analysis for the country health index.
# .copy() makes the filtered result an independent frame: later cells assign new
# lag columns to it, and writing into a slice view would raise
# SettingWithCopyWarning (and may not write through) in pandas.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [254]:
df_updated
Out[254]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.093162

2099 rows × 9 columns

In [255]:
# A Random Forest is an ensemble learner for non-sequential, tabular data, so the
# time series must be recast as a supervised-learning problem: each row becomes a
# single observation whose features include the mortality rate observed 1 day,
# 7 days, and 30 days earlier. pandas' shift(), applied within each country,
# builds those lagged columns.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [256]:
# The first 1/7/30 rows of each country have no lagged value after shift();
# treat "no history" as a mortality rate of 0 so those rows remain usable.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [257]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the matrix fed
# to PCA still contains 'Mortality Rate' and its three lag columns — the target
# leaks into the components that later become model inputs, likely inflating the
# reported R^2. Consider fitting PCA on the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[257]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [258]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# NOTE(review): the matrix passed to transform (iloc[:, 2:]) includes
# 'Mortality Rate' and its lag columns, so the target contributes to these
# components — TODO confirm and restrict to predictor columns only.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [259]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the named
# original features — the labels only borrow the feature names, which can
# mislead downstream interpretation (e.g. feature importances).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values  # components preserve df_updated's row order
In [260]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and y from 'Mortality Rate' — so this step appears to be dead
# code; confirm before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [261]:
# Model inputs: the six PCA components (labelled with the original feature names)
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values  # rows of principal_df and df_updated are aligned by construction

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this is a daily time series; a shuffled random split puts
# near-duplicate adjacent days in both train and test, which can inflate test
# R^2. Consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [262]:
# Fit scaling on the training set only, so test-set statistics never leak into
# the scaler; the test set is later transformed with these train-set parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[262]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [263]:
# Apply scaling on the training set using the parameters fitted on the training data
X_train_scaled = scaler.transform(X_train)
In [264]:
# Apply scaling on the test set with the train-fitted scaler (never re-fit here)
X_test_scaled = scaler.transform(X_test)
In [265]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder — every grid candidate below sets it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate settings
param_grid = {
    'n_estimators': [50, 100, 200],   # number of trees in the forest
    'max_depth': [5, 10, 15],         # cap on individual tree depth
    'min_samples_split': [2, 5, 10],  # min samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # min samples required at a leaf
}
In [266]:
# perform grid search and 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidates x 10 folds in parallel on all cores;
# it does not change the selected model or its scores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (GridSearchCV scores a regressor with R^2 by default)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9989558784228938
In [267]:
# fit random forest model with best hyperparameters from above.
# **best_params_ forwards all tuned values without repeating each key by hand.
# (GridSearchCV already refits — grid_search.best_estimator_ is this same model —
# but the explicit fit keeps the final estimator obvious to readers.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predict on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
In [268]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target (mortality rate)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1
# and returns their KL divergence — it treats y_test/y_pred as probability
# distributions, not regression targets, and returns inf if any y_pred element
# is 0 where y_test is not. Confirm this metric is intended here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0032162020605789846
R2 Score: 0.9990544534349546
RMSE: 0.056712
Entropy Value: 0.000341364428693161
In [269]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[269]:
feature importance
5 population 0.515688
1 human_development_index 0.452145
2 extreme_poverty 0.028238
3 gdp_per_capita 0.003655
4 population_density 0.000192
0 hospital_beds_per_thousand 0.000083
In [270]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path — replace with a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[270]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [271]:
# Country pair analysed in this section.
country1 = 'Denmark'
country2 = 'Finland'

# Restrict to the population-health features (plus identifiers and the target)
# for the two countries of interest, in a single indexed selection.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [272]:
# Preview the filtered two-country subset (rendered by the notebook).
df_updated
Out[272]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 18.8 80.90 19.677 42.3 0.00000
5188 Denmark 2/3/2020 114.767 6.41 18.8 80.90 19.677 42.3 0.00000
5189 Denmark 2/4/2020 114.767 6.41 18.8 80.90 19.677 42.3 0.00000
5190 Denmark 2/5/2020 114.767 6.41 18.8 80.90 19.677 42.3 0.00000
5191 Denmark 2/6/2020 114.767 6.41 18.8 80.90 19.677 42.3 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 5.76 22.6 81.91 21.228 42.8 0.55159
8372 Finland 12/26/2022 153.507 5.76 22.6 81.91 21.228 42.8 0.55159
8373 Finland 12/27/2022 153.507 5.76 22.6 81.91 21.228 42.8 0.55159
8374 Finland 12/28/2022 153.507 5.76 22.6 81.91 21.228 42.8 0.55159
8375 Finland 12/29/2022 153.507 5.76 22.6 81.91 21.228 42.8 0.55159

2128 rows × 9 columns

In [273]:
# Convert the time series to a supervised-learning layout: a Random Forest has
# no notion of sequence, so past mortality must be materialised as explicit
# feature columns (previous day / week / month). Shifting within each location
# group prevents one country's history leaking into another's rows.
# (The former free-floating triple-quoted string was a no-op statement; its
# rationale is preserved here as real comments.)
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for col, lag in lags.items():
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
In [274]:
# The lag columns begin with NaNs (no history yet for the first rows of each
# country); treat missing history as zero mortality, in one vectorised call.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [275]:
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and its lag columns — the target leaks into
# the components. PCA is also fit on the full, unscaled, pre-split data, so
# test rows and large-scale columns influence the components. Confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[275]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [276]:
# Keep the first 6 principal components, matching the number of original input
# variables for this Random Forest analysis. The transform input has more than
# 6 columns (features plus target/lag columns), so this keeps the 6 components
# with the highest explained variance.
n_components = 6  # equals the number of original input variables
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [277]:
# Wrap the component scores in a DataFrame and carry the country label along.
# NOTE(review): these columns are principal components, not the original
# variables — reusing raw feature names here is misleading downstream (e.g. in
# the importance table). Consider PC1..PC6 labels instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [278]:
# One-hot encode the 'location' column with get_dummies().
# NOTE(review): the dummy columns do not appear to be used afterwards — X is
# built from principal_df and y from 'Mortality Rate' only; verify this step
# is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [279]:
# Assemble the design matrix from the principal-component scores and the target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 split for the Random Forest model.
# NOTE(review): train_test_split shuffles daily time-series rows, so future
# days can appear in training while earlier days are tested — this is
# optimistic for forecasting-style claims. Confirm a random split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [280]:
# Fit the scaler on the training partition only, so test-set statistics never
# influence the scaling parameters (the fitted scaler is the cell's output).
scaler = StandardScaler()
scaler.fit(X_train)
Out[280]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [281]:
# Standardise the training features with the scaler fit above (train-only statistics).
X_train_scaled = scaler.transform(X_train)
In [282]:
# Standardise the test features with the train-fitted scaler (no refit, so no test leakage).
X_test_scaled = scaler.transform(X_test)
In [283]:
# Base RandomForestRegressor for the grid search. n_estimators is intentionally
# left at its default: the grid below supplies it, so a constructor value would
# be silently overridden by GridSearchCV and mislead the reader.
rf = RandomForestRegressor(random_state=42)

# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [284]:
# Exhaustive grid search with 10-fold cross-validation (81 candidates x 10 folds).
# n_jobs=-1 runs the CV fits in parallel on all cores; scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best hyperparameters and their mean cross-validated R^2 (default regressor scorer).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989113781858382
In [285]:
# Refit a forest with the winning hyperparameters. Unpacking best_params_ keeps
# this cell in sync with the grid automatically instead of re-listing every key
# by hand (grid_search.best_estimator_ would also work, since GridSearchCV
# refits on the full training set by default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions on the scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [286]:
# Evaluate the Random Forest on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalising both arguments into probability distributions; y_test/y_pred are
# regression targets, not distributions, so this value has no clear
# interpretation as a regression metric — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008523656273191014
R2 Score: 0.9951646978023675
RMSE: 0.092324
Entropy Value: 0.0017240442447165512
In [287]:
# Rank the model's inputs by impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on PCA component scores (X came from
# principal_df), so these importances belong to principal components, not the
# original variables — labelling them with selected_cols is misleading.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[287]:
feature importance
1 diabetes_prevalence 0.955970
2 male_smokers 0.028467
3 life_expectancy 0.006111
5 median_age 0.005548
0 cardiovasc_death_rate 0.003586
4 aged_65_older 0.000319
In [288]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path — replace with a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[288]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [289]:
# Country pair analysed in this section.
country1 = 'Denmark'
country2 = 'Finland'

# Restrict to the country-health-index features (plus identifiers and the
# target) for the two countries of interest, in a single indexed selection.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [290]:
# Preview the filtered two-country subset (rendered by the notebook).
df_updated
Out[290]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5188 Denmark 2/3/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5189 Denmark 2/4/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5190 Denmark 2/5/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5191 Denmark 2/6/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159

2128 rows × 9 columns

In [291]:
# Convert the time series to a supervised-learning layout: a Random Forest has
# no notion of sequence, so past mortality must be materialised as explicit
# feature columns (previous day / week / month). Shifting within each location
# group prevents one country's history leaking into another's rows.
# (The former free-floating triple-quoted string was a no-op statement; its
# rationale is preserved here as real comments.)
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for col, lag in lags.items():
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
In [292]:
# The lag columns begin with NaNs (no history yet for the first rows of each
# country); treat missing history as zero mortality, in one vectorised call.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [293]:
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and its lag columns — the target leaks into
# the components. PCA is also fit on the full, unscaled, pre-split data, so
# test rows and large-scale columns (e.g. population) dominate the variance.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[293]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [294]:
# Keep the first 6 principal components, matching the number of original input
# variables for this Random Forest analysis. The transform input has more than
# 6 columns (features plus target/lag columns), so this keeps the 6 components
# with the highest explained variance.
n_components = 6  # equals the number of original input variables
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [295]:
# Wrap the component scores in a DataFrame and carry the country label along.
# NOTE(review): these columns are principal components, not the original
# variables — reusing raw feature names here is misleading downstream (e.g. in
# the importance table). Consider PC1..PC6 labels instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [296]:
# One-hot encode the 'location' column with get_dummies().
# NOTE(review): the dummy columns do not appear to be used afterwards — X is
# built from principal_df and y from 'Mortality Rate' only; verify this step
# is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [297]:
# Assemble the design matrix from the principal-component scores and the target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 split for the Random Forest model.
# NOTE(review): train_test_split shuffles daily time-series rows, so future
# days can appear in training while earlier days are tested — this is
# optimistic for forecasting-style claims. Confirm a random split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [298]:
# Fit the scaler on the training partition only, so test-set statistics never
# influence the scaling parameters (the fitted scaler is the cell's output).
scaler = StandardScaler()
scaler.fit(X_train)
Out[298]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [299]:
# Standardise the training features with the scaler fit above (train-only statistics).
X_train_scaled = scaler.transform(X_train)
In [300]:
# Standardise the test features with the train-fitted scaler (no refit, so no test leakage).
X_test_scaled = scaler.transform(X_test)
In [301]:
# Base RandomForestRegressor for the grid search. n_estimators is intentionally
# left at its default: the grid below supplies it, so a constructor value would
# be silently overridden by GridSearchCV and mislead the reader.
rf = RandomForestRegressor(random_state=42)

# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [302]:
# Exhaustive grid search with 10-fold cross-validation (81 candidates x 10 folds).
# n_jobs=-1 runs the CV fits in parallel on all cores; scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best hyperparameters and their mean cross-validated R^2 (default regressor scorer).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989156833724155
In [303]:
# Refit a forest with the winning hyperparameters. Unpacking best_params_ keeps
# this cell in sync with the grid automatically instead of re-listing every key
# by hand (grid_search.best_estimator_ would also work, since GridSearchCV
# refits on the full training set by default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions on the scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [304]:
# Evaluate the Random Forest on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalising both arguments into probability distributions; y_test/y_pred are
# regression targets, not distributions, so this value has no clear
# interpretation as a regression metric — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008329520038950185
R2 Score: 0.995274827461516
RMSE: 0.091266
Entropy Value: 0.0016508456649447665
In [305]:
# Rank the model's inputs by impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on PCA component scores (X came from
# principal_df), so these importances belong to principal components, not the
# original variables — labelling them with selected_cols is misleading.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[305]:
feature importance
1 human_development_index 0.956574
2 extreme_poverty 0.028531
5 population 0.008398
3 gdp_per_capita 0.006081
4 population_density 0.000371
0 hospital_beds_per_thousand 0.000044
In [306]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path — replace with a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[306]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [307]:
# Country pair analysed in this section.
country1 = 'Iceland'
country2 = 'Italy'

# Restrict to the population-health features (plus identifiers and the target)
# for the two countries of interest, in a single indexed selection.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [308]:
# Preview the filtered two-country subset (rendered by the notebook).
df_updated
Out[308]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
20911 Iceland 2/28/2020 117.992 5.31 15.2 82.99 14.431 37.3 0.000000
20912 Iceland 2/29/2020 117.992 5.31 15.2 82.99 14.431 37.3 0.000000
20913 Iceland 3/1/2020 117.992 5.31 15.2 82.99 14.431 37.3 0.000000
20914 Iceland 3/2/2020 117.992 5.31 15.2 82.99 14.431 37.3 0.000000
20915 Iceland 3/3/2020 117.992 5.31 15.2 82.99 14.431 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 27.8 83.51 23.021 47.9 0.735109

2100 rows × 9 columns

In [309]:
# Convert the time series to a supervised-learning layout: a Random Forest has
# no notion of sequence, so past mortality must be materialised as explicit
# feature columns (previous day / week / month). Shifting within each location
# group prevents one country's history leaking into another's rows.
# (The former free-floating triple-quoted string was a no-op statement; its
# rationale is preserved here as real comments.)
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for col, lag in lags.items():
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
In [310]:
# The lag columns begin with NaNs (no history yet for the first rows of each
# country); treat missing history as zero mortality, in one vectorised call.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [311]:
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and its lag columns — the target leaks into
# the components. PCA is also fit on the full, unscaled, pre-split data, so
# test rows and large-scale columns influence the components. Confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[311]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [312]:
# Keep the first 6 principal components, matching the number of original input
# variables for this Random Forest analysis. The transform input has more than
# 6 columns (features plus target/lag columns), so this keeps the 6 components
# with the highest explained variance.
n_components = 6  # equals the number of original input variables
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [313]:
# Wrap the component scores in a DataFrame and carry the country label along.
# NOTE(review): these columns are principal components, not the original
# variables — reusing raw feature names here is misleading downstream (e.g. in
# the importance table). Consider PC1..PC6 labels instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [314]:
# One-hot encode the 'location' column with get_dummies().
# NOTE(review): the dummy columns do not appear to be used afterwards — X is
# built from principal_df and y from 'Mortality Rate' only; verify this step
# is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [315]:
# Assemble the design matrix from the principal-component scores and the target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 split for the Random Forest model.
# NOTE(review): train_test_split shuffles daily time-series rows, so future
# days can appear in training while earlier days are tested — this is
# optimistic for forecasting-style claims. Confirm a random split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [316]:
# Fit the scaler on the training partition only, so test-set statistics never
# influence the scaling parameters (the fitted scaler is the cell's output).
scaler = StandardScaler()
scaler.fit(X_train)
Out[316]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [317]:
# Standardise the training features with the scaler fit above (train-only statistics).
X_train_scaled = scaler.transform(X_train)
In [318]:
# Standardise the test features with the train-fitted scaler (no refit, so no test leakage).
X_test_scaled = scaler.transform(X_test)
In [319]:
# Base RandomForestRegressor for the grid search. n_estimators is intentionally
# left at its default: the grid below supplies it, so a constructor value would
# be silently overridden by GridSearchCV and mislead the reader.
rf = RandomForestRegressor(random_state=42)

# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [320]:
# Exhaustive grid search with 10-fold cross-validation (81 candidates x 10 folds).
# n_jobs=-1 runs the CV fits in parallel on all cores; scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best hyperparameters and their mean cross-validated R^2 (default regressor scorer).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986885167827337
In [321]:
# Refit a forest with the winning hyperparameters. Unpacking best_params_ keeps
# this cell in sync with the grid automatically instead of re-listing every key
# by hand (grid_search.best_estimator_ would also work, since GridSearchCV
# refits on the full training set by default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions on the scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [322]:
# Evaluate the Random Forest on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalising both arguments into probability distributions; y_test/y_pred are
# regression targets, not distributions, so this value has no clear
# interpretation as a regression metric — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.025865966557999595
R2 Score: 0.9978789245215064
RMSE: 0.160829
Entropy Value: 0.001439826222505498
In [323]:
# Rank the model's inputs by impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on PCA component scores (X came from
# principal_df), so these importances belong to principal components, not the
# original variables — labelling them with selected_cols is misleading.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[323]:
feature importance
0 cardiovasc_death_rate 0.493613
1 diabetes_prevalence 0.469690
5 median_age 0.024296
2 male_smokers 0.010599
3 life_expectancy 0.001623
4 aged_65_older 0.000179
In [324]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path — replace with a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[324]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [325]:
# Country pair analysed in this section.
country1 = 'Iceland'
country2 = 'Italy'

# Restrict to the country-health-index features (plus identifiers and the
# target) for the two countries of interest, in a single indexed selection.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [326]:
# Preview the filtered two-country subset (rendered by the notebook).
df_updated
Out[326]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
20911 Iceland 2/28/2020 2.91 0.949 0.2 46482.958 3.404 372903 0.000000
20912 Iceland 2/29/2020 2.91 0.949 0.2 46482.958 3.404 372903 0.000000
20913 Iceland 3/1/2020 2.91 0.949 0.2 46482.958 3.404 372903 0.000000
20914 Iceland 3/2/2020 2.91 0.949 0.2 46482.958 3.404 372903 0.000000
20915 Iceland 3/3/2020 2.91 0.949 0.2 46482.958 3.404 372903 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2100 rows × 9 columns

In [327]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is grouped by 'location' so a lag never crosses the boundary between the two countries.
# NOTE(review): assigning new columns on a boolean-filtered slice can raise
# SettingWithCopyWarning unless the filter result was copied — verify.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [328]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the shifts leave NaNs in each country's earliest rows; treat pre-history mortality as 0)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [329]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which includes
# 'Mortality Rate' itself plus its lagged copies — the prediction target leaks into the
# components used as model inputs and likely inflates the near-perfect scores; confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[329]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [330]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Columns of the transformed matrix are ordered by explained variance; keep the first 6.
# NOTE(review): same target-leakage caveat as the PCA fit — iloc[:, 2:] still contains
# 'Mortality Rate' and its lagged columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [331]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels reuse the raw feature names, but each column is a principal
# component (a linear mix of all inputs), so the later feature-importance table ranks
# components, not the named features — labelling them PC1..PC6 would avoid misreading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [332]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not referenced afterwards (X is built from
# principal_df), so this step appears to be dead transformation work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [333]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first six principal components (carrying raw-feature labels); y is the target
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, so this random split of daily
# time-series rows mixes future and past between train and test (look-ahead bias);
# a chronological split would be the safer evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [334]:
# Fit scaling on the training set
# Fitting on the training split only keeps test-set statistics out of the scaler;
# the fitted scaler is the cell's last expression, so its repr is displayed below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[334]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [335]:
# Apply scaling on the training set (using the train-fitted mean/variance)
X_train_scaled = scaler.transform(X_train)
In [336]:
# Apply scaling on the test set (reusing the train-fitted scaler — no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [337]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — every grid value below overrides it during the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 combinations searched exhaustively
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [338]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 x 10 model fits across all CPU cores; results are identical
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score (default regressor scoring: R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.999041254481315
In [339]:
# fit random forest model with best hyperparameters from above
# dict unpacking passes all tuned hyperparameters at once instead of repeating each key
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [340]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into probability
# distributions and returns their KL divergence — mortality rates are not distributions
# (and y_test contains zeros), so this value is not a standard regression metric; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.020316963384106206
R2 Score: 0.9983339569880426
RMSE: 0.142538
Entropy Value: 0.001115730684759821
In [341]:
# Rank inputs by impurity-based importance, highest first.
# (Reminder: the "features" here are principal components carrying raw-feature labels.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[341]:
feature importance
1 human_development_index 0.897053
5 population 0.078632
2 extreme_poverty 0.022551
3 gdp_per_capita 0.001409
4 population_density 0.000254
0 hospital_beds_per_thousand 0.000100
In [342]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable DATA_DIR / relative
# path would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[342]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [343]:
country1 = 'Portugal'
country2 = 'Slovenia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame, so the lag-column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [344]:
df_updated
Out[344]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 25.0 81.32 19.062 44.5 0.536669

2096 rows × 9 columns

In [345]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is grouped by 'location' so a lag never crosses the boundary between the two countries.
# NOTE(review): assigning new columns on a boolean-filtered slice can raise
# SettingWithCopyWarning unless the filter result was copied — verify.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [346]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the shifts leave NaNs in each country's earliest rows; treat pre-history mortality as 0)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [347]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which includes
# 'Mortality Rate' itself plus its lagged copies — the prediction target leaks into the
# components used as model inputs and likely inflates the near-perfect scores; confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[347]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [348]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Columns of the transformed matrix are ordered by explained variance; keep the first 6.
# NOTE(review): same target-leakage caveat as the PCA fit — iloc[:, 2:] still contains
# 'Mortality Rate' and its lagged columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [349]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels reuse the raw feature names, but each column is a principal
# component (a linear mix of all inputs), so the later feature-importance table ranks
# components, not the named features — labelling them PC1..PC6 would avoid misreading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [350]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not referenced afterwards (X is built from
# principal_df), so this step appears to be dead transformation work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [351]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first six principal components (carrying raw-feature labels); y is the target
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, so this random split of daily
# time-series rows mixes future and past between train and test (look-ahead bias);
# a chronological split would be the safer evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [352]:
# Fit scaling on the training set
# Fitting on the training split only keeps test-set statistics out of the scaler;
# the fitted scaler is the cell's last expression, so its repr is displayed below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[352]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [353]:
# Apply scaling on the training set (using the train-fitted mean/variance)
X_train_scaled = scaler.transform(X_train)
In [354]:
# Apply scaling on the test set (reusing the train-fitted scaler — no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [355]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — every grid value below overrides it during the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 combinations searched exhaustively
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [356]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 x 10 model fits across all CPU cores; results are identical
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score (default regressor scoring: R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998368534343579
In [357]:
# fit random forest model with best hyperparameters from above
# dict unpacking passes all tuned hyperparameters at once instead of repeating each key
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [358]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into probability
# distributions and returns their KL divergence — mortality rates are not distributions
# (and y_test contains zeros), so this value is not a standard regression metric; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0031857365835024896
R2 Score: 0.9984052886270314
RMSE: 0.056442
Entropy Value: 0.00046176412460026606
In [359]:
# Rank inputs by impurity-based importance, highest first.
# (Reminder: the "features" here are principal components carrying raw-feature labels.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[359]:
feature importance
1 diabetes_prevalence 0.932445
0 cardiovasc_death_rate 0.036452
2 male_smokers 0.027502
5 median_age 0.002234
3 life_expectancy 0.001093
4 aged_65_older 0.000274
In [360]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable DATA_DIR / relative
# path would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[360]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [361]:
country1 = 'Portugal'
country2 = 'Slovenia'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame, so the lag-column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [362]:
df_updated
Out[362]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2096 rows × 9 columns

In [363]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is grouped by 'location' so a lag never crosses the boundary between the two countries.
# NOTE(review): assigning new columns on a boolean-filtered slice can raise
# SettingWithCopyWarning unless the filter result was copied — verify.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [364]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the shifts leave NaNs in each country's earliest rows; treat pre-history mortality as 0)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [365]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which includes
# 'Mortality Rate' itself plus its lagged copies — the prediction target leaks into the
# components used as model inputs and likely inflates the near-perfect scores; confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[365]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [366]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Columns of the transformed matrix are ordered by explained variance; keep the first 6.
# NOTE(review): same target-leakage caveat as the PCA fit — iloc[:, 2:] still contains
# 'Mortality Rate' and its lagged columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [367]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels reuse the raw feature names, but each column is a principal
# component (a linear mix of all inputs), so the later feature-importance table ranks
# components, not the named features — labelling them PC1..PC6 would avoid misreading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [368]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not referenced afterwards (X is built from
# principal_df), so this step appears to be dead transformation work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [369]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first six principal components (carrying raw-feature labels); y is the target
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, so this random split of daily
# time-series rows mixes future and past between train and test (look-ahead bias);
# a chronological split would be the safer evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [370]:
# Fit scaling on the training set
# Fitting on the training split only keeps test-set statistics out of the scaler;
# the fitted scaler is the cell's last expression, so its repr is displayed below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[370]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [371]:
# Apply scaling on the training set (using the train-fitted mean/variance)
X_train_scaled = scaler.transform(X_train)
In [372]:
# Apply scaling on the test set (reusing the train-fitted scaler — no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [373]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — every grid value below overrides it during the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 combinations searched exhaustively
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [374]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 x 10 model fits across all CPU cores; results are identical
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score (default regressor scoring: R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.998167144839121
In [375]:
# fit random forest model with best hyperparameters from above
# dict unpacking passes all tuned hyperparameters at once instead of repeating each key
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [376]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into probability
# distributions and returns their KL divergence — mortality rates are not distributions
# (and y_test contains zeros), so this value is not a standard regression metric; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0038490153568325174
R2 Score: 0.9980732655059875
RMSE: 0.062040
Entropy Value: 0.000510878761187502
In [377]:
# Rank inputs by impurity-based importance, highest first.
# (Reminder: the "features" here are principal components carrying raw-feature labels.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[377]:
feature importance
1 human_development_index 0.964288
2 extreme_poverty 0.029185
5 population 0.004776
3 gdp_per_capita 0.001256
4 population_density 0.000316
0 hospital_beds_per_thousand 0.000179
In [378]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable DATA_DIR / relative
# path would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[378]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [379]:
country1 = 'Sweden'
country2 = 'United Kingdom'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame, so the lag-column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [380]:
df_updated
Out[380]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.9 82.80 19.985 41.0 0.816005

2126 rows × 9 columns

In [381]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is grouped by 'location' so a lag never crosses the boundary between the two countries.
# NOTE(review): assigning new columns on a boolean-filtered slice can raise
# SettingWithCopyWarning unless the filter result was copied — verify.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [382]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the shifts leave NaNs in each country's earliest rows; treat pre-history mortality as 0)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [383]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which includes
# 'Mortality Rate' itself plus its lagged copies — the prediction target leaks into the
# components used as model inputs and likely inflates the near-perfect scores; confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[383]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [384]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Columns of the transformed matrix are ordered by explained variance; keep the first 6.
# NOTE(review): same target-leakage caveat as the PCA fit — iloc[:, 2:] still contains
# 'Mortality Rate' and its lagged columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [385]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels reuse the raw feature names, but each column is a principal
# component (a linear mix of all inputs), so the later feature-importance table ranks
# components, not the named features — labelling them PC1..PC6 would avoid misreading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [386]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not referenced afterwards (X is built from
# principal_df), so this step appears to be dead transformation work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [387]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first six principal components (carrying raw-feature labels); y is the target
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, so this random split of daily
# time-series rows mixes future and past between train and test (look-ahead bias);
# a chronological split would be the safer evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [388]:
# Fit scaling on the training set
# Fitting on the training split only keeps test-set statistics out of the scaler;
# the fitted scaler is the cell's last expression, so its repr is displayed below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[388]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [389]:
# Apply scaling on the training set (using the train-fitted mean/variance)
X_train_scaled = scaler.transform(X_train)
In [390]:
# Apply scaling on the test set (reusing the train-fitted scaler — no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [391]:
# Base Random Forest regressor; fixed seed keeps tree construction reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search in the next cell.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [392]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): shuffled CV folds on time-series rows give an optimistic score;
# TimeSeriesSplit would be a more honest estimate for this data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9636756360711152
In [393]:
# Refit a Random Forest on the full training split using the tuned hyperparameters.
# Unpacking best_params_ passes exactly the four searched parameters; the same seed
# yields an identical fit to spelling each keyword out by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [394]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# FIXME(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence after
# normalizing each vector into a probability distribution — it is not a regression
# error metric, and it misbehaves wherever y_pred is ~0 while y_test is not. The
# reported "Entropy Value" should not be interpreted as model quality.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.3274210506772977
R2 Score: 0.9879826989155525
RMSE: 0.572207
Entropy Value: 0.0035336135372386534
In [395]:
# Rank the model inputs by Random-Forest importance, highest first.
# (Recall these "features" are principal components labelled with original column
# names, so the names should be read with care.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[395]:
feature importance
0 cardiovasc_death_rate 0.820240
5 median_age 0.074668
2 male_smokers 0.031916
1 diabetes_prevalence 0.031298
3 life_expectancy 0.027753
4 aged_65_older 0.014125
In [396]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative Path so the notebook runs on other machines. Also note this entire
# pipeline (load → select pair → lag → PCA → RF) is repeated per country pair below;
# it should be factored into one function parameterized by countries and feature set.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[396]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [397]:
# Country pair under comparison in this run of the (repeated) pipeline.
country1 = 'Sweden'
country2 = 'United Kingdom'

# Restrict to the country-health-index predictors plus the target, keeping only the
# rows for the two selected countries.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [398]:
df_updated
Out[398]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.816005

2126 rows × 9 columns

In [399]:
# Create lagged copies of the target so the time series becomes a supervised-learning
# table: each row carries the mortality rate observed 1 day, 7 days, and 30 days
# earlier for the same country (shift is applied per-location group so countries do
# not bleed into one another). Random Forests operate on tabular, non-sequential
# data, so these lags are how the temporal signal is exposed to the model when
# assessing which variables best predict COVID-19 mortality per country.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [400]:
# The first 1/7/30 rows of each country have no lagged value; fill those NaNs with 0.
# (Zero is a modelling choice here, not a neutral value.)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [401]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): iloc[:,2:] spans every column after 'location' and 'date', which —
# given the selection at In [397] — includes 'Mortality Rate' itself plus its three
# lagged copies. The prediction target therefore leaks into the PCA features used to
# build X below; drop 'Mortality Rate' before fitting. Also, PCA is fitted on
# unscaled columns here, so large-magnitude columns (population) dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[401]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [402]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): equivalent to PCA(n_components=6).
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [403]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# FIXME(review): these are principal components (mixtures of all PCA inputs, including
# the leaked target columns), not the named variables — the labels make the
# downstream importance table misleading. Prefer 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [404]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [405]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split on a time series mixes future rows into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [406]:
# Fit scaling on the training set
# Scaler statistics come from the training split only — no test-set leakage here.
scaler = StandardScaler()
scaler.fit(X_train)
Out[406]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [407]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [408]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [409]:
# Base Random Forest regressor; fixed seed keeps tree construction reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search in the next cell.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [410]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): shuffled CV folds on time-series rows give an optimistic score;
# TimeSeriesSplit would be a more honest estimate for this data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9657040062307898
In [411]:
# Refit a Random Forest on the full training split using the tuned hyperparameters;
# unpacking best_params_ supplies exactly the four searched parameters.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [412]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# FIXME(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence between
# normalized distributions, not a regression error metric — see note at In [394].
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.1755698880186287
R2 Score: 0.9935560764913614
RMSE: 0.419011
Entropy Value: 0.0030443221086945424
In [413]:
# Rank the model inputs by Random-Forest importance, highest first.
# (These "features" are principal components labelled with original column names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[413]:
feature importance
1 human_development_index 0.795197
5 population 0.117897
2 extreme_poverty 0.038008
3 gdp_per_capita 0.030219
4 population_density 0.018509
0 hospital_beds_per_thousand 0.000171
In [414]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — use a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[414]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [415]:
# Country pair under comparison in this run of the (repeated) pipeline.
country1 = 'United States'
country2 = 'Austria'

# Restrict to the population-health-index predictors plus the target, keeping only
# the rows for the two selected countries.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [416]:
df_updated
Out[416]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 24.6 78.86 15.413 38.3 1.084791

2112 rows × 9 columns

In [417]:
# Create lagged copies of the target so the time series becomes a supervised-learning
# table: each row carries the mortality rate observed 1 day, 7 days, and 30 days
# earlier for the same country (shift is applied per-location group). Random Forests
# operate on tabular, non-sequential data, so these lags are how the temporal signal
# is exposed to the model.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [418]:
# The first 1/7/30 rows of each country have no lagged value; fill those NaNs with 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [419]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): given the selection at In [415], iloc[:,2:] includes 'Mortality Rate'
# and its three lagged copies — the target leaks into the PCA features used for X.
# Drop 'Mortality Rate' before fitting; also note the inputs are unscaled.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[419]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [420]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): equivalent to PCA(n_components=6).
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [421]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# FIXME(review): these are principal components, not the named variables — the labels
# make the downstream importance table misleading. Prefer 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [422]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [423]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split on a time series mixes future rows into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [424]:
# Fit scaling on the training set
# Scaler statistics come from the training split only — no test-set leakage here.
scaler = StandardScaler()
scaler.fit(X_train)
Out[424]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [425]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [426]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [427]:
# Base Random Forest regressor; fixed seed keeps tree construction reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search in the next cell.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [428]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): shuffled CV folds on time-series rows give an optimistic score.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9726217165926947
In [429]:
# Refit a Random Forest on the full training split using the tuned hyperparameters;
# unpacking best_params_ supplies exactly the four searched parameters.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [430]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# FIXME(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence between
# normalized distributions, not a regression error metric — see note at In [394].
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011712039224039057
R2 Score: 0.9925072626920814
RMSE: 0.108222
Entropy Value: 0.0007421212804027486
In [431]:
# Rank the model inputs by Random-Forest importance, highest first.
# (These "features" are principal components labelled with original column names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[431]:
feature importance
1 diabetes_prevalence 0.885072
0 cardiovasc_death_rate 0.064410
2 male_smokers 0.022757
4 aged_65_older 0.011487
5 median_age 0.008366
3 life_expectancy 0.007908
In [432]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — use a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[432]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [433]:
# Country pair under comparison in this run of the (repeated) pipeline.
country1 = 'United States'
country2 = 'Austria'

# Restrict to the country-health-index predictors plus the target, keeping only the
# rows for the two selected countries.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [434]:
df_updated
Out[434]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2112 rows × 9 columns

In [435]:
# Create lagged copies of the target so the time series becomes a supervised-learning
# table: each row carries the mortality rate observed 1 day, 7 days, and 30 days
# earlier for the same country (shift is applied per-location group). Random Forests
# operate on tabular, non-sequential data, so these lags are how the temporal signal
# is exposed to the model.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [436]:
# The first 1/7/30 rows of each country have no lagged value; fill those NaNs with 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [437]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): given the selection at In [433], iloc[:,2:] includes 'Mortality Rate'
# and its three lagged copies — the target leaks into the PCA features used for X.
# Drop 'Mortality Rate' before fitting; also note the inputs are unscaled.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[437]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [438]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): equivalent to PCA(n_components=6).
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [439]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# FIXME(review): these are principal components, not the named variables — the labels
# make the downstream importance table misleading. Prefer 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [440]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [441]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split on a time series mixes future rows into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [442]:
# Fit scaling on the training set
# Scaler statistics come from the training split only — no test-set leakage here.
scaler = StandardScaler()
scaler.fit(X_train)
Out[442]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [443]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [444]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [445]:
# Base Random Forest regressor; fixed seed keeps tree construction reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search in the next cell.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [446]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): shuffled CV folds on time-series rows give an optimistic score.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9748898922979944
In [447]:
# Refit a Random Forest on the full training split using the tuned hyperparameters;
# unpacking best_params_ supplies exactly the four searched parameters.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [448]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# FIXME(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence between
# normalized distributions, not a regression error metric — see note at In [394].
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011356761380877888
R2 Score: 0.9927345504853691
RMSE: 0.106568
Entropy Value: 0.0008026741253104213
In [449]:
# Rank the model inputs by Random-Forest importance, highest first.
# (These "features" are principal components labelled with original column names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[449]:
feature importance
1 human_development_index 0.941349
2 extreme_poverty 0.024552
5 population 0.015274
4 population_density 0.010674
3 gdp_per_capita 0.007128
0 hospital_beds_per_thousand 0.001023
In [450]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — use a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[450]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [451]:
country1 = 'Belgium'
country2 = 'Estonia'

# Restrict to the population-health predictors (plus identifiers and the target)
# for the two countries under comparison.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'male_smokers', 'life_expectancy',
                          'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_health_cols]
In [452]:
df_updated
Out[452]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 31.4 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 31.4 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 31.4 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 31.4 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 31.4 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 255.569 4.02 39.3 78.74 19.452 42.7 0.464100
7306 Estonia 12/26/2022 255.569 4.02 39.3 78.74 19.452 42.7 0.464100
7307 Estonia 12/27/2022 255.569 4.02 39.3 78.74 19.452 42.7 0.463645
7308 Estonia 12/28/2022 255.569 4.02 39.3 78.74 19.452 42.7 0.466423
7309 Estonia 12/29/2022 255.569 4.02 39.3 78.74 19.452 42.7 0.466423

2121 rows × 9 columns

In [453]:
# A Random Forest is an ensemble learner for non-sequential, tabular data, so
# the OWID time series is recast as a supervised-learning problem by adding
# lagged copies of the target — the previous day's, week's, and month's
# mortality rate — built per country with pandas' shift().
for n_days, lag_col in [(1, 'prev_day_mortality'),
                        (7, 'prev_week_mortality'),
                        (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(n_days)
In [454]:
# Rows at the start of each country's series have no lagged history; treat the
# pre-sample period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [455]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the three lagged
# mortality columns, so the target leaks into the components; PCA is also fit on
# unscaled data, so large-magnitude columns dominate the variance — confirm both
# are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[455]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [456]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project onto all components, then keep only the six with highest variance.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [457]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (PC1..PC6), not the
# original features — relabelling them with the raw feature names is misleading,
# and the later "feature importances" actually describe components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [458]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below —
# only 'Mortality Rate' is read from df_updated afterwards — so this step may be
# dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [459]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds principal components that were relabelled with raw feature names in
# the cell above; y is the raw mortality rate aligned by row order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled split of daily time-series rows puts near-duplicate
# adjacent days in both train and test, which can inflate test R^2 — consider a
# chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [460]:
# Fit scaling on the training set
# (fit the mean/std on training data only, to avoid test-set leakage)
scaler = StandardScaler()
scaler.fit(X_train)
Out[460]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [461]:
# Apply scaling on the training set
# (standardize with the training-set statistics fitted above)
X_train_scaled = scaler.transform(X_train)
In [462]:
# Apply scaling on the test set
# (reuse training-set statistics; never refit the scaler on test data)
X_test_scaled = scaler.transform(X_test)
In [463]:
# Instantiate the RandomForestRegressor Model
# (the n_estimators given here is a placeholder; GridSearchCV below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 below this means 810 model fits
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [464]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 810 fits across all cores; the selected model and
# scores are unchanged because each fit is seeded via the estimator's
# random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998306333473295
In [465]:
# fit random forest model with best hyperparameters from above
# (** unpacks the tuned dict instead of extracting each key by hand; note that
# grid_search.best_estimator_ is already refit on the full training set when
# refit=True, the GridSearchCV default, and could be used directly)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predict the held-out test set for evaluation in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [466]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence between
# the two normalized vectors, not the Shannon entropy of the errors; it is also
# undefined where y_pred == 0 but y_test > 0 — confirm the intended metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.017273039382732797
R2 Score: 0.9986315561079507
RMSE: 0.131427
Entropy Value: 0.000694873976607357
In [467]:
# Rank the model inputs by impurity-based importance, highest first.
# (Built as one chained expression instead of reusing the `feature_importances`
# name for both the raw ndarray and the DataFrame.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[467]:
feature importance
1 diabetes_prevalence 0.723162
0 cardiovasc_death_rate 0.231758
2 male_smokers 0.038044
5 median_age 0.004786
3 life_expectancy 0.001834
4 aged_65_older 0.000415
In [468]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[468]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [469]:
country1 = 'Belgium'
country2 = 'Estonia'

# Restrict to the country-health predictors (plus identifiers and the target)
# for the two countries under comparison.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                       'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
In [470]:
df_updated
Out[470]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7306 Estonia 12/26/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7307 Estonia 12/27/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.463645
7308 Estonia 12/28/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423
7309 Estonia 12/29/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423

2121 rows × 9 columns

In [471]:
# A Random Forest is an ensemble learner for non-sequential, tabular data, so
# the OWID time series is recast as a supervised-learning problem by adding
# lagged copies of the target — the previous day's, week's, and month's
# mortality rate — built per country with pandas' shift().
for n_days, lag_col in [(1, 'prev_day_mortality'),
                        (7, 'prev_week_mortality'),
                        (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(n_days)
In [472]:
# Rows at the start of each country's series have no lagged history; treat the
# pre-sample period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [473]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the three lagged
# mortality columns, so the target leaks into the components; PCA is also fit on
# unscaled data, so large-magnitude columns (e.g. population) dominate the
# variance — confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[473]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [474]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project onto all components, then keep only the six with highest variance.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [475]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (PC1..PC6), not the
# original features — relabelling them with the raw feature names is misleading,
# and the later "feature importances" actually describe components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [476]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below —
# only 'Mortality Rate' is read from df_updated afterwards — so this step may be
# dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [477]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds principal components that were relabelled with raw feature names in
# the cell above; y is the raw mortality rate aligned by row order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled split of daily time-series rows puts near-duplicate
# adjacent days in both train and test, which can inflate test R^2 — consider a
# chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [478]:
# Fit scaling on the training set
# (fit the mean/std on training data only, to avoid test-set leakage)
scaler = StandardScaler()
scaler.fit(X_train)
Out[478]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [479]:
# Apply scaling on the training set
# (standardize with the training-set statistics fitted above)
X_train_scaled = scaler.transform(X_train)
In [480]:
# Apply scaling on the test set
# (reuse training-set statistics; never refit the scaler on test data)
X_test_scaled = scaler.transform(X_test)
In [481]:
# Instantiate the RandomForestRegressor Model
# (the n_estimators given here is a placeholder; GridSearchCV below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 below this means 810 model fits
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [482]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 810 fits across all cores; the selected model and
# scores are unchanged because each fit is seeded via the estimator's
# random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984095979233917
In [483]:
# fit random forest model with best hyperparameters from above
# (** unpacks the tuned dict instead of extracting each key by hand; note that
# grid_search.best_estimator_ is already refit on the full training set when
# refit=True, the GridSearchCV default, and could be used directly)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predict the held-out test set for evaluation in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [484]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence between
# the two normalized vectors, not the Shannon entropy of the errors; it is also
# undefined where y_pred == 0 but y_test > 0 — confirm the intended metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.016179009133714634
R2 Score: 0.9987182298530174
RMSE: 0.127197
Entropy Value: 0.0006399632348900017
In [485]:
# Rank the model inputs by impurity-based importance, highest first.
# (Built as one chained expression instead of reusing the `feature_importances`
# name for both the raw ndarray and the DataFrame.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[485]:
feature importance
1 human_development_index 0.932614
2 extreme_poverty 0.038249
5 population 0.019770
0 hospital_beds_per_thousand 0.005374
3 gdp_per_capita 0.003504
4 population_density 0.000490
In [486]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[486]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [487]:
country1 = 'Ireland'
country2 = 'Latvia'

# Restrict to the population-health predictors (plus identifiers and the target)
# for the two countries under comparison.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'male_smokers', 'life_expectancy',
                          'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_health_cols]
In [488]:
df_updated
Out[488]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 3.28 25.7 82.30 13.928 38.7 0.000000
18839 Ireland 3/1/2020 126.459 3.28 25.7 82.30 13.928 38.7 0.000000
18840 Ireland 3/2/2020 126.459 3.28 25.7 82.30 13.928 38.7 0.000000
18841 Ireland 3/3/2020 126.459 3.28 25.7 82.30 13.928 38.7 0.000000
18842 Ireland 3/4/2020 126.459 3.28 25.7 82.30 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 51.0 75.29 19.754 43.9 0.631969

2073 rows × 9 columns

In [489]:
# A Random Forest is an ensemble learner for non-sequential, tabular data, so
# the OWID time series is recast as a supervised-learning problem by adding
# lagged copies of the target — the previous day's, week's, and month's
# mortality rate — built per country with pandas' shift().
for n_days, lag_col in [(1, 'prev_day_mortality'),
                        (7, 'prev_week_mortality'),
                        (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(n_days)
In [490]:
# Rows at the start of each country's series have no lagged history; treat the
# pre-sample period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [491]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the three lagged
# mortality columns, so the target leaks into the components; PCA is also fit on
# unscaled data, so large-magnitude columns dominate the variance — confirm both
# are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[491]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [492]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project onto all components, then keep only the six with highest variance.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [493]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (PC1..PC6), not the
# original features — relabelling them with the raw feature names is misleading,
# and the later "feature importances" actually describe components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [494]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below —
# only 'Mortality Rate' is read from df_updated afterwards — so this step may be
# dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [495]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds principal components that were relabelled with raw feature names in
# the cell above; y is the raw mortality rate aligned by row order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled split of daily time-series rows puts near-duplicate
# adjacent days in both train and test, which can inflate test R^2 — consider a
# chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [496]:
# Fit scaling on the training set
# (fit the mean/std on training data only, to avoid test-set leakage)
scaler = StandardScaler()
scaler.fit(X_train)
Out[496]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [497]:
# Apply scaling on the training set
# (standardize with the training-set statistics fitted above)
X_train_scaled = scaler.transform(X_train)
In [498]:
# Apply scaling on the test set
# (reuse training-set statistics; never refit the scaler on test data)
X_test_scaled = scaler.transform(X_test)
In [499]:
# Instantiate the RandomForestRegressor Model
# (the n_estimators given here is a placeholder; GridSearchCV below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 below this means 810 model fits
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [500]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 810 fits across all cores; the selected model and
# scores are unchanged because each fit is seeded via the estimator's
# random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998374045977475
In [501]:
# fit random forest model with best hyperparameters from above
# (** unpacks the tuned dict instead of extracting each key by hand; note that
# grid_search.best_estimator_ is already refit on the full training set when
# refit=True, the GridSearchCV default, and could be used directly)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predict the held-out test set for evaluation in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [502]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence between
# the two normalized vectors, not the Shannon entropy of the errors; it is also
# undefined where y_pred == 0 but y_test > 0 — confirm the intended metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003030155883805974
R2 Score: 0.9986252667840227
RMSE: 0.055047
Entropy Value: 0.0004986576827960621
In [503]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[503]:
feature importance
1 diabetes_prevalence 0.747948
0 cardiovasc_death_rate 0.211509
2 male_smokers 0.032826
5 median_age 0.003667
3 life_expectancy 0.003526
4 aged_65_older 0.000523
In [504]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[504]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [505]:
country1 = 'Ireland'
country2 = 'Latvia'

# Restrict to the country-health predictors (plus identifiers and the target)
# for the two countries under comparison.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                       'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
In [506]:
df_updated
Out[506]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631969

2073 rows × 9 columns

In [507]:
# A Random Forest is an ensemble learner for non-sequential, tabular data, so
# the OWID time series is recast as a supervised-learning problem by adding
# lagged copies of the target — the previous day's, week's, and month's
# mortality rate — built per country with pandas' shift().
for n_days, lag_col in [(1, 'prev_day_mortality'),
                        (7, 'prev_week_mortality'),
                        (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(n_days)
In [508]:
# Rows at the start of each country's series have no lagged history; treat the
# pre-sample period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [509]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the three lagged
# mortality columns, so the target leaks into the components; PCA is also fit on
# unscaled data, so large-magnitude columns dominate the variance — confirm both
# are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[509]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [510]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): keeping 6 components because there are 6 input variables does
# not make the components equivalent to those variables — each component is a
# linear mix of ALL numeric columns, and the discarded components' variance is
# lost.
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [511]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original
# features — labelling them with the original feature names is misleading and
# makes the downstream feature importances look attributable to single raw
# features when they are not.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [512]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards (X is
# built from principal_df), so this step effectively only removes 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [513]:
# Feature matrix from the (mislabeled) principal components; target from the
# original frame. Rows align positionally between the two frames.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles a time series, so later observations
# of a country land in training while earlier ones are tested — prefer a
# chronological split to avoid temporal leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [514]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean and standard deviation from the
# training split only; the fitted statistics are reused on the test set below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[514]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [515]:
# Apply scaling on the training set
# (standardizes using the mean/std learned above; returns a new array)
X_train_scaled = scaler.transform(X_train)
In [516]:
# Apply scaling on the test set
# (same training-set statistics — the test set never influences the scaler)
X_test_scaled = scaler.transform(X_test)
In [517]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder — every grid value below overrides
# it during the search; random_state fixes the forest's bootstrap sampling)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 below this fits 810 forests.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [518]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): CV folds come from the randomly shuffled series, so the
# near-perfect R^2 scores are inflated by temporal leakage (nearly identical
# adjacent days fall into different folds). Consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9979400711573009
In [519]:
# GridSearchCV refits the best hyperparameter combination on the whole
# training set by default (refit=True), so reuse that fitted estimator
# instead of rebuilding it parameter-by-parameter — the rebuilt model used
# the same random_state=42 and was therefore identical.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [520]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both inputs and computes the KL
# divergence between them as if they were probability distributions; applied
# to raw mortality rates (which include zeros) this is not a meaningful
# regression metric — consider dropping it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004705897262529415
R2 Score: 0.9978650097467429
RMSE: 0.068600
Entropy Value: 0.0007148305943023663
In [521]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the inputs are principal components labelled with original
# feature names, so these importances describe components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[521]:
feature importance
1 human_development_index 0.945415
2 extreme_poverty 0.036526
5 population 0.010191
0 hospital_beds_per_thousand 0.003836
3 gdp_per_capita 0.003491
4 population_density 0.000540
In [522]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a Path built from a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[522]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [523]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Restrict the frame to the two countries being compared, keeping only the
# population-health features plus the target.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [524]:
# Rich display (last expression) of the filtered two-country frame.
df_updated
Out[524]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 128.275 4.42 26.0 82.25 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 128.275 4.42 26.0 82.25 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 128.275 4.42 26.0 82.25 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 128.275 4.42 26.0 82.25 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 128.275 4.42 26.0 82.25 14.312 39.7 0.377872

2078 rows × 9 columns

In [525]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning table by
# adding lagged copies of the target: previous-day, previous-week, and
# previous-month mortality rates, computed within each country via shift().
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [526]:
# Each country's first 1/7/30 rows have no lagged value; encode that
# "no prior observation" state as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [527]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the FULL dataset before the train/test split
# below, so test-set information leaks into the transformed features — fit on
# training rows only. PCA is also scale-sensitive, and standardization happens
# only AFTER PCA in this pipeline. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[527]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [528]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): keeping 6 components because there are 6 input variables does
# not make the components equivalent to those variables — each component is a
# linear mix of ALL numeric columns.
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [529]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original
# features — labelling them with the original feature names is misleading and
# invalidates a per-feature reading of the downstream importances.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [530]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards (X is
# built from principal_df), so this step effectively only removes 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [531]:
# Feature matrix from the (mislabeled) principal components; target from the
# original frame. Rows align positionally between the two frames.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles a time series — prefer a
# chronological split to avoid temporal leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [532]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean and standard deviation from the
# training split only; the fitted statistics are reused on the test set below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[532]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [533]:
# Apply scaling on the training set
# (standardizes using the mean/std learned above; returns a new array)
X_train_scaled = scaler.transform(X_train)
In [534]:
# Apply scaling on the test set
# (same training-set statistics — the test set never influences the scaler)
X_test_scaled = scaler.transform(X_test)
In [535]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder — every grid value below overrides
# it during the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 below this fits 810 forests.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [536]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): CV folds come from the randomly shuffled series, so the
# near-perfect scores are inflated by temporal leakage. Consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986500446225092
In [537]:
# GridSearchCV refits the best hyperparameter combination on the whole
# training set by default (refit=True), so reuse that fitted estimator
# instead of rebuilding it parameter-by-parameter — the rebuilt model used
# the same random_state=42 and was therefore identical.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [538]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its inputs as probability
# distributions (normalizing them) and computes a KL divergence; applied to
# raw mortality rates this is not a meaningful regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009737687060063437
R2 Score: 0.9987308600470954
RMSE: 0.098680
Entropy Value: 0.001010970627044727
In [539]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the inputs are principal components labelled with original
# feature names, so these importances describe components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[539]:
feature importance
1 diabetes_prevalence 0.761081
0 cardiovasc_death_rate 0.183282
2 male_smokers 0.032007
5 median_age 0.018832
3 life_expectancy 0.003997
4 aged_65_older 0.000802
In [540]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a Path built from a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[540]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [541]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Restrict the frame to the two countries being compared, keeping only the
# country-health-index features plus the target.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [542]:
# Rich display (last expression) of the filtered two-country frame.
df_updated
Out[542]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872

2078 rows × 9 columns

In [543]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning table by
# adding lagged copies of the target: previous-day, previous-week, and
# previous-month mortality rates, computed within each country via shift().
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [544]:
# Each country's first 1/7/30 rows have no lagged value; encode that
# "no prior observation" state as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [545]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the FULL dataset before the train/test split
# below, so test-set information leaks into the transformed features — fit on
# training rows only. PCA is also scale-sensitive, and standardization happens
# only AFTER PCA in this pipeline. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[545]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [546]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): keeping 6 components because there are 6 input variables does
# not make the components equivalent to those variables — each component is a
# linear mix of ALL numeric columns.
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [547]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original
# features — labelling them with the original feature names is misleading and
# invalidates a per-feature reading of the downstream importances.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [548]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards (X is
# built from principal_df), so this step effectively only removes 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [549]:
# Feature matrix from the (mislabeled) principal components; target from the
# original frame. Rows align positionally between the two frames.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles a time series — prefer a
# chronological split to avoid temporal leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [550]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean and standard deviation from the
# training split only; the fitted statistics are reused on the test set below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[550]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [551]:
# Apply scaling on the training set
# (standardizes using the mean/std learned above; returns a new array)
X_train_scaled = scaler.transform(X_train)
In [552]:
# Apply scaling on the test set
# (same training-set statistics — the test set never influences the scaler)
X_test_scaled = scaler.transform(X_test)
In [553]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder — every grid value below overrides
# it during the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 below this fits 810 forests.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [554]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): CV folds come from the randomly shuffled series, so the
# near-perfect scores are inflated by temporal leakage. Consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985273943931006
In [555]:
# GridSearchCV refits the best hyperparameter combination on the whole
# training set by default (refit=True), so reuse that fitted estimator
# instead of rebuilding it parameter-by-parameter — the rebuilt model used
# the same random_state=42 and was therefore identical.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [556]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its inputs as probability
# distributions (normalizing them) and computes a KL divergence; applied to
# raw mortality rates this is not a meaningful regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007512088981123421
R2 Score: 0.9990209284610492
RMSE: 0.086672
Entropy Value: 0.0009194530556352238
In [557]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the inputs are principal components labelled with original
# feature names, so these importances describe components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[557]:
feature importance
1 human_development_index 0.943109
2 extreme_poverty 0.037435
5 population 0.012103
3 gdp_per_capita 0.006234
0 hospital_beds_per_thousand 0.000694
4 population_density 0.000425
In [558]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a Path built from a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[558]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [559]:
country1 = 'Romania'
country2 = 'Slovakia'

# Restrict the frame to the two countries being compared, keeping only the
# population-health features plus the target.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [560]:
# Rich display (last expression) of the filtered two-country frame.
df_updated
Out[560]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 37.7 77.54 15.07 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 37.7 77.54 15.07 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 37.7 77.54 15.07 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 37.7 77.54 15.07 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 37.7 77.54 15.07 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 37.1 76.05 17.85 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 37.1 76.05 17.85 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 37.1 76.05 17.85 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 37.1 76.05 17.85 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 37.1 76.05 17.85 43.0 2.036403

2067 rows × 9 columns

In [561]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning table by
# adding lagged copies of the target: previous-day, previous-week, and
# previous-month mortality rates, computed within each country via shift().
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [562]:
# Each country's first 1/7/30 rows have no lagged value; encode that
# "no prior observation" state as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [563]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the FULL dataset before the train/test split
# below, so test-set information leaks into the transformed features — fit on
# training rows only. PCA is also scale-sensitive, and standardization happens
# only AFTER PCA in this pipeline. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[563]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [564]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): keeping 6 components because there are 6 input variables does
# not make the components equivalent to those variables — each component is a
# linear mix of ALL numeric columns.
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [565]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original
# features — labelling them with the original feature names is misleading and
# invalidates a per-feature reading of the downstream importances.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [566]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards (X is
# built from principal_df), so this step effectively only removes 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [567]:
# Feature matrix from the (mislabeled) principal components; target from the
# original frame. Rows align positionally between the two frames.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles a time series — prefer a
# chronological split to avoid temporal leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [568]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean and standard deviation from the
# training split only; the fitted statistics are reused on the test set below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[568]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [569]:
# Apply scaling on the training set
# (standardizes using the mean/std learned above; returns a new array)
X_train_scaled = scaler.transform(X_train)
In [570]:
# Apply scaling on the test set
# (same training-set statistics — the test set never influences the scaler)
X_test_scaled = scaler.transform(X_test)
In [571]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder — every grid value below overrides
# it during the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 below this fits 810 forests.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [572]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): CV folds come from the randomly shuffled series, so the
# near-perfect scores are inflated by temporal leakage. Consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9980209439218296
In [573]:
# GridSearchCV refits the best hyperparameter combination on the whole
# training set by default (refit=True), so reuse that fitted estimator
# instead of rebuilding it parameter-by-parameter — the rebuilt model used
# the same random_state=42 and was therefore identical.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [574]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its inputs as probability
# distributions (normalizing them) and computes a KL divergence; applied to
# raw mortality rates this is not a meaningful regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0025858758735811936
R2 Score: 0.998541908086158
RMSE: 0.050852
Entropy Value: 0.00025608225493814756
In [575]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the inputs are principal components labelled with original
# feature names, so these importances describe components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[575]:
feature importance
5 median_age 0.635662
0 cardiovasc_death_rate 0.191981
1 diabetes_prevalence 0.151447
2 male_smokers 0.017079
3 life_expectancy 0.003517
4 aged_65_older 0.000314
In [576]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path -- consider a configurable DATA_DIR
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[576]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [577]:
# Country pair analysed in this section
country1 = 'Romania'
country2 = 'Slovakia'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the current country pair. .copy() makes the slice an independent
# frame so the later cells that assign lagged-mortality columns do not trigger
# pandas' SettingWithCopyWarning / chained-assignment pitfalls.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [578]:
df_updated
Out[578]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.820 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.820 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.820 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.820 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.820 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403

2067 rows × 9 columns

In [579]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day / 7 days / 30 days). Shifting within each
# location group keeps lags from leaking across the country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [580]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the lag shifts leave NaNs at the start of each country's series)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [581]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the full dataset before the train/test split
# (test-set leakage), on unscaled columns (large-magnitude columns such as
# population will dominate the components), and iloc[:,2:] includes
# 'Mortality Rate' and its lag columns -- i.e. the target itself leaks into
# the components used as model inputs.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[581]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [582]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# NOTE(review): pca was fit on every column from position 2 onward (including
# the target and lag features), so these first 6 components are mixtures of all
# of those columns, not of the 6 intended inputs only.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [583]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features -- reusing the raw feature names here is misleading (PC1..PC6 would
# be more accurate labels).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [584]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are never used afterwards in this
# section -- X is built from principal_df -- so this encoding appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [585]:
# The six PCA-derived predictors (labelled upstream with the original feature names)
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X, y = principal_df[selected_cols].values, df_updated['Mortality Rate'].values

# 70/30 train/test split, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [586]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics do not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[586]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [587]:
# Apply scaling on the training set
# Standardize using the mean/std learned from the training split above.
X_train_scaled = scaler.transform(X_train)
In [588]:
# Apply scaling on the test set
# Same fitted scaler as the training split -- never refit on test data.
X_test_scaled = scaler.transform(X_test)
In [589]:
# Instantiate the RandomForestRegressor Model
# Base random forest (seeded so the grid-search results are reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search below
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [590]:
# Exhaustive search over param_grid, scoring each candidate with 10-fold cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean CV score
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best CV score: {grid_search.best_score_}')
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9947150727401987
In [591]:
# Refit a Random Forest on the full training split using the best hyperparameters
# found by the grid search. Unpacking best_params_ directly avoids copying each
# key by hand (the original repeated four dictionary lookups, which is error-prone
# if the grid ever changes).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test split
y_pred = best_rf_model.predict(X_test_scaled)
In [592]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs and computes
# KL divergence -- it treats y_test/y_pred as probability distributions, not
# regression targets. Confirm this is the intended "entropy" metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0023029360576114406
R2 Score: 0.9987014487129855
RMSE: 0.047989
Entropy Value: 0.00021180348784876973
In [593]:
# Feature-importance table for the fitted random forest
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on PCA components, so labelling the
# importances with the raw feature names in selected_cols is misleading.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[593]:
feature importance
5 population 0.646503
1 human_development_index 0.328172
2 extreme_poverty 0.018557
3 gdp_per_capita 0.005316
4 population_density 0.000793
0 hospital_beds_per_thousand 0.000660
In [594]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path -- consider a configurable DATA_DIR
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[594]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [595]:
# Country pair analysed in this section
country1 = 'Spain'
country2 = 'Switzerland'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the current country pair. .copy() makes the slice an independent
# frame so the later cells that assign lagged-mortality columns do not trigger
# pandas' SettingWithCopyWarning / chained-assignment pitfalls.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [596]:
df_updated
Out[596]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 31.4 83.56 19.436 45.5 0.855148

2102 rows × 9 columns

In [597]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day / 7 days / 30 days). Shifting within each
# location group keeps lags from leaking across the country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [598]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the lag shifts leave NaNs at the start of each country's series)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [599]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the full dataset before the train/test split
# (test-set leakage), on unscaled columns, and iloc[:,2:] includes
# 'Mortality Rate' and its lag columns -- the target leaks into the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[599]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [600]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# NOTE(review): pca was fit on every column from position 2 onward (including
# the target and lag features), so these 6 components mix all of those columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [601]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features -- reusing the raw feature names here is misleading (PC1..PC6 would
# be more accurate labels).
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [602]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are never used afterwards in this
# section -- X is built from principal_df -- so this encoding appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [603]:
# The six PCA-derived predictors (labelled upstream with the original feature names)
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X, y = principal_df[selected_cols].values, df_updated['Mortality Rate'].values

# 70/30 train/test split, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [604]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics do not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[604]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [605]:
# Apply scaling on the training set
# Standardize using the mean/std learned from the training split above.
X_train_scaled = scaler.transform(X_train)
In [606]:
# Apply scaling on the test set
# Same fitted scaler as the training split -- never refit on test data.
X_test_scaled = scaler.transform(X_test)
In [607]:
# Instantiate the RandomForestRegressor Model
# Base random forest (seeded so the grid-search results are reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search below
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [608]:
# Exhaustive search over param_grid, scoring each candidate with 10-fold cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean CV score
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best CV score: {grid_search.best_score_}')
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9988894734018896
In [609]:
# Refit a Random Forest on the full training split using the best hyperparameters
# found by the grid search. Unpacking best_params_ directly avoids copying each
# key by hand (the original repeated four dictionary lookups, which is error-prone
# if the grid ever changes).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test split
y_pred = best_rf_model.predict(X_test_scaled)
In [610]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs and computes
# KL divergence -- it treats y_test/y_pred as probability distributions, not
# regression targets. Confirm this is the intended "entropy" metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008646276456698085
R2 Score: 0.998600555703449
RMSE: 0.092985
Entropy Value: 0.00042564211305673786
In [611]:
# Feature-importance table for the fitted random forest
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on PCA components, so labelling the
# importances with the raw feature names in selected_cols is misleading.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[611]:
feature importance
0 cardiovasc_death_rate 0.969299
2 male_smokers 0.017999
5 median_age 0.005651
1 diabetes_prevalence 0.004935
3 life_expectancy 0.001889
4 aged_65_older 0.000228
In [612]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path -- consider a configurable DATA_DIR
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[612]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [613]:
# Country pair analysed in this section
country1 = 'Spain'
country2 = 'Switzerland'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the current country pair. .copy() makes the slice an independent
# frame so the later cells that assign lagged-mortality columns do not trigger
# pandas' SettingWithCopyWarning / chained-assignment pitfalls.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [614]:
df_updated
Out[614]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.00 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.00 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.00 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.00 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.00 34272.360 93.105 47558632 0.855148

2102 rows × 9 columns

In [615]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day / 7 days / 30 days). Shifting within each
# location group keeps lags from leaking across the country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [616]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the lag shifts leave NaNs at the start of each country's series)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [617]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the full dataset before the train/test split
# (test-set leakage), on unscaled columns, and iloc[:,2:] includes
# 'Mortality Rate' and its lag columns -- the target leaks into the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[617]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [618]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# NOTE(review): pca was fit on every column from position 2 onward (including
# the target and lag features), so these 6 components mix all of those columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [619]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features -- reusing the raw feature names here is misleading (PC1..PC6 would
# be more accurate labels).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [620]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are never used afterwards in this
# section -- X is built from principal_df -- so this encoding appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [621]:
# The six PCA-derived predictors (labelled upstream with the original feature names)
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X, y = principal_df[selected_cols].values, df_updated['Mortality Rate'].values

# 70/30 train/test split, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [622]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics do not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[622]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [623]:
# Apply scaling on the training set
# Standardize using the mean/std learned from the training split above.
X_train_scaled = scaler.transform(X_train)
In [624]:
# Apply scaling on the test set
# Same fitted scaler as the training split -- never refit on test data.
X_test_scaled = scaler.transform(X_test)
In [625]:
# Instantiate the RandomForestRegressor Model
# Base random forest (seeded so the grid-search results are reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search below
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [626]:
# Exhaustive search over param_grid, scoring each candidate with 10-fold cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean CV score
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best CV score: {grid_search.best_score_}')
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984087201734722
In [627]:
# Refit a Random Forest on the full training split using the best hyperparameters
# found by the grid search. Unpacking best_params_ directly avoids copying each
# key by hand (the original repeated four dictionary lookups, which is error-prone
# if the grid ever changes).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test split
y_pred = best_rf_model.predict(X_test_scaled)
In [628]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs and computes
# KL divergence -- it treats y_test/y_pred as probability distributions, not
# regression targets. Confirm this is the intended "entropy" metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009004477715309808
R2 Score: 0.9985425789881667
RMSE: 0.094892
Entropy Value: 0.0005624411655893531
In [629]:
# Feature-importance table for the fitted random forest
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on PCA components, so labelling the
# importances with the raw feature names in selected_cols is misleading.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[629]:
feature importance
1 human_development_index 0.946716
5 population 0.027480
2 extreme_poverty 0.023142
3 gdp_per_capita 0.002243
4 population_density 0.000363
0 hospital_beds_per_thousand 0.000055
In [630]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path -- consider a configurable DATA_DIR
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[630]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [631]:
country1 = 'Bulgaria'
country2 = 'Czechia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [632]:
df_updated
Out[632]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 6.82 38.3 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 6.82 38.3 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 6.82 38.3 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 6.82 38.3 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 6.82 38.3 79.38 19.027 43.3 0.919575

2061 rows × 9 columns

In [633]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [634]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [635]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] contains not only the six predictors but also
# 'Mortality Rate' and the three lagged mortality columns, so the target leaks into
# the components that later become X — this inflates every downstream score.
# Fit PCA on the six predictor columns only.
# NOTE(review): PCA is fit on unstandardized data, so large-scale features (e.g.
# cardiovasc_death_rate) dominate the components; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[635]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [636]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [637]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the original
# features; reusing the raw feature names here misleads the later importance analysis.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [638]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns are never used in X below; this step can
# be dropped (y is the only thing read from df_updated afterwards).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [639]:
# Feature matrix X comes from the first six principal components (labelled with the
# raw feature names in principal_df); y is aligned with X purely by row position.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffled split mixes past and future observations of a time
# series; a chronological split would give a more honest generalization estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [640]:
# Fit scaling on the training set
# (correctly fit on the training split only, then applied to both splits; note that
# tree-based models are scale-invariant, so this step mainly aids comparability)
scaler = StandardScaler()
scaler.fit(X_train)
Out[640]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [641]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [642]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [643]:
# Base estimator for the hyper-parameter search; the fixed seed keeps runs reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space for the grid search below: forest size, maximum tree depth, and the
# minimum sample counts controlling splits and leaves.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [644]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the near-perfect CV score is expected here because the PCA features
# were derived from a column set that included the target and its lags — this score
# does not reflect genuine predictive power.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (default scoring for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.946009198558348
In [645]:
# Re-train a fresh forest on the full training split using the best hyper-parameter
# combination found by the grid search (unpacking best_params_ passes exactly the
# same four settings as spelling each keyword out individually).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split, consumed by the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [646]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two arrays after renormalizing each to a probability distribution —
# it is not a regression metric, and it is ill-defined wherever y_pred is 0 while
# y_test is not. Consider dropping it or using MAE instead.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0036901120196825726
R2 Score: 0.9977493146946407
RMSE: 0.060746
Entropy Value: 0.00036888555731396776
In [647]:
# NOTE(review): these importances belong to principal components PC1..PC6; the
# original-feature labels carried over from principal_df are misleading — PC1 is a
# mixture of all inputs (including the leaked target), not cardiovasc_death_rate.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[647]:
feature importance
0 cardiovasc_death_rate 0.830856
5 median_age 0.065756
3 life_expectancy 0.034074
2 male_smokers 0.031315
1 diabetes_prevalence 0.021356
4 aged_65_older 0.016643
In [648]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[648]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [649]:
country1 = 'Bulgaria'
country2 = 'Czechia'

# Extracting important features for the Random Forest Model Analysis for the country health index
# NOTE(review): this begins another copy of the same pipeline (only the country pair
# and the feature list change); factoring the pipeline into a function parameterized
# by (countries, feature columns) would remove dozens of duplicated cells.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [650]:
# Sanity-check the filtered frame (row count and columns).
df_updated
Out[650]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919575

2061 rows × 9 columns

In [651]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (computed per country so one country's history never leaks into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [652]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country's series have no history to shift from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [653]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] still contains 'Mortality Rate' and the three lagged
# mortality columns, so the target leaks into the components used as X below; PCA is
# also fit on unstandardized data. Fit PCA on the six predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[653]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [654]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [655]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the original
# features; reusing the raw feature names misleads the later importance analysis.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [656]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used in X below; this step can be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [657]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffled split mixes past and future observations of a
# time series; consider a chronological split instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [658]:
# Fit scaling on the training set
# (correctly fit on the training split only; trees are scale-invariant anyway)
scaler = StandardScaler()
scaler.fit(X_train)
Out[658]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [659]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [660]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [661]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [662]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the near-perfect CV score reflects the target leakage introduced at
# the PCA step, not genuine predictive power.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9509416306303644
In [663]:
# fit random forest model with best hyperparameters from above
# NOTE(review): equivalent to grid_search.best_estimator_ (refit=True by default), or
# more compactly RandomForestRegressor(random_state=42, **grid_search.best_params_).
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [664]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence between the
# two arrays renormalized as probability distributions — not a regression metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003709674750284998
R2 Score: 0.9977373829294084
RMSE: 0.060907
Entropy Value: 0.0006342010482872406
In [665]:
# NOTE(review): these importances belong to principal components PC1..PC6; the
# original-feature labels carried over from principal_df are misleading.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[665]:
feature importance
5 population 0.598261
0 hospital_beds_per_thousand 0.292702
2 extreme_poverty 0.046316
3 gdp_per_capita 0.031321
1 human_development_index 0.026994
4 population_density 0.004406
In [666]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[666]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [667]:
country1 = 'France'
country2 = 'Serbia'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): another verbatim copy of the pipeline for a new country pair;
# consider a single parameterized function instead of repeated cells.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [668]:
# Sanity-check the filtered frame (row count and columns).
df_updated
Out[668]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
8376 France 1/24/2020 86.060 4.77 35.6 82.66 19.718 42.0 0.000000
8377 France 1/25/2020 86.060 4.77 35.6 82.66 19.718 42.0 0.000000
8378 France 1/26/2020 86.060 4.77 35.6 82.66 19.718 42.0 0.000000
8379 France 1/27/2020 86.060 4.77 35.6 82.66 19.718 42.0 0.000000
8380 France 1/28/2020 86.060 4.77 35.6 82.66 19.718 42.0 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 40.2 76.00 17.366 41.2 0.717058
16755 Serbia 12/26/2022 439.415 10.08 40.2 76.00 17.366 41.2 0.716963
16756 Serbia 12/27/2022 439.415 10.08 40.2 76.00 17.366 41.2 0.716677
16757 Serbia 12/28/2022 439.415 10.08 40.2 76.00 17.366 41.2 0.716395
16758 Serbia 12/29/2022 439.415 10.08 40.2 76.00 17.366 41.2 0.716205

2109 rows × 9 columns

In [669]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (computed per country so one country's history never leaks into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [670]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country's series have no history to shift from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [671]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] still contains 'Mortality Rate' and the three lagged
# mortality columns, so the target leaks into the components used as X below; PCA is
# also fit on unstandardized data. Fit PCA on the six predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[671]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [672]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [673]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the original
# features; reusing the raw feature names misleads the later importance analysis.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [674]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used in X below; this step can be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [675]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffled split mixes past and future observations of a
# time series; consider a chronological split instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [676]:
# Fit scaling on the training set
# (correctly fit on the training split only; trees are scale-invariant anyway)
scaler = StandardScaler()
scaler.fit(X_train)
Out[676]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [677]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [678]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [679]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [680]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the near-perfect CV score reflects the target leakage introduced at
# the PCA step, not genuine predictive power.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9898468756077163
In [681]:
# fit random forest model with best hyperparameters from above
# NOTE(review): equivalent to grid_search.best_estimator_ (refit=True by default), or
# more compactly RandomForestRegressor(random_state=42, **grid_search.best_params_).
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [682]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence between the
# two arrays renormalized as probability distributions — not a regression metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.12251771563518532
R2 Score: 0.9894511723406865
RMSE: 0.350025
Entropy Value: 0.0026765662501610065
In [683]:
# NOTE(review): these importances belong to principal components PC1..PC6; the
# original-feature labels carried over from principal_df are misleading.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[683]:
feature importance
0 cardiovasc_death_rate 0.675523
1 diabetes_prevalence 0.292794
2 male_smokers 0.020230
3 life_expectancy 0.006730
5 median_age 0.003406
4 aged_65_older 0.001317
In [684]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[684]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [685]:
country1 = 'France'
country2 = 'Serbia'

# Extracting important features for the Random Forest Model Analysis for the country health index
# NOTE(review): final verbatim copy of the pipeline; consider a single parameterized
# function instead of repeated cells.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [686]:
# Sanity-check the filtered frame (row count and columns).
df_updated
Out[686]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
8376 France 1/24/2020 5.980 0.901 0.02 38605.671 122.578 67813000 0.000000
8377 France 1/25/2020 5.980 0.901 0.02 38605.671 122.578 67813000 0.000000
8378 France 1/26/2020 5.980 0.901 0.02 38605.671 122.578 67813000 0.000000
8379 France 1/27/2020 5.980 0.901 0.02 38605.671 122.578 67813000 0.000000
8380 France 1/28/2020 5.980 0.901 0.02 38605.671 122.578 67813000 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.717058
16755 Serbia 12/26/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716963
16756 Serbia 12/27/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716677
16757 Serbia 12/28/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716395
16758 Serbia 12/29/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716205

2109 rows × 9 columns

In [687]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (computed per country so one country's history never leaks into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [688]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country's series have no history to shift from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [689]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] still contains 'Mortality Rate' and the three lagged
# mortality columns, so the target leaks into the components used as X below; PCA is
# also fit on unstandardized data. Fit PCA on the six predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[689]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [690]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [691]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the original
# features; reusing the raw feature names misleads the later importance analysis.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [692]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used in X below; this step can be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [693]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffled split mixes past and future observations of a
# time series; consider a chronological split instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [694]:
# Fit scaling on the training set
# (correctly fit on the training split only; trees are scale-invariant anyway)
scaler = StandardScaler()
scaler.fit(X_train)
Out[694]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [695]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [696]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [697]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [698]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the near-perfect CV score reflects the target leakage introduced at
# the PCA step, not genuine predictive power.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9913712149726834
In [699]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [700]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.14086060776709658
R2 Score: 0.9878718414915131
RMSE: 0.375314
Entropy Value: 0.002701292210197016
In [701]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[701]:
feature importance
1 human_development_index 0.943928
2 extreme_poverty 0.024987
5 population 0.011892
3 gdp_per_capita 0.010286
0 hospital_beds_per_thousand 0.007449
4 population_density 0.001457
In [7]:
# Country Pair by Pair Analysis relative to life expectancy
In [8]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hard-coded absolute Windows path — prefer a configurable DATA_DIR /
# relative path. Execution counts also restart at In [7] here; re-run the notebook
# top-to-bottom (Restart & Run All) before sharing so counts are sequential.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[8]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [9]:
# Showing the pairings of countries based on life expectancy (13 pairs of countries)
df_Austria = df[(df.location == "Austria")]
df_Belgium = df[(df.location == "Belgium")]

df_Canada = df[(df.location == "Canada")]
df_Cyprus = df[(df.location == "Cyprus")]

df_Denmark = df[(df.location == "Denmark")]
df_Finland = df[(df.location == "Finland")]

df_France = df[(df.location == "France")]
df_Iceland = df[(df.location == "Iceland")]

df_Ireland = df[(df.location == "Ireland")]
df_Italy = df[(df.location == "Italy")]

df_Luxembourg = df[(df.location == "Luxembourg")]
df_Netherlands = df[(df.location == "Netherlands")]

df_Portugal = df[(df.location == "Portugal")]
df_Slovenia = df[(df.location == "Slovenia")]

df_Spain = df[(df.location == "Spain")]
df_Sweden = df[(df.location == "Sweden")]

df_Switzerland = df[(df.location == "Switzerland")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]

df_Czechia = df[(df.location == "Czechia")]
df_Estonia = df[(df.location == "Estonia")]

df_UnitedStates = df[(df.location == "United States")]
df_Bulgaria = df[(df.location == "Bulgaria")]

df_Latvia = df[(df.location == "Latvia")]
df_Romania = df[(df.location == "Romania")]

df_Serbia = df[(df.location == "Serbia")]
df_Slovakia = df[(df.location == "Slovakia")]
In [10]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [11]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
dataframe_one.to_csv("dataframe-one.csv")
In [12]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[12]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [13]:
country1 = 'Austria'
country2 = 'Belgium'

# Extracting important features for Random Forest Model Analysis for the population health index
# (population-health covariates plus the 'Mortality Rate' target, keyed by location/date)
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the current country pair.
# NOTE(review): df_updated is overwritten in place; later pair analyses must re-read the CSV.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [14]:
# Quick visual check of the filtered frame
df_updated
Out[14]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 114.898 4.29 25.1 31.4 18.571 41.8 0.711787
2095 Belgium 12/26/2022 114.898 4.29 25.1 31.4 18.571 41.8 0.711787
2096 Belgium 12/27/2022 114.898 4.29 25.1 31.4 18.571 41.8 0.711787
2097 Belgium 12/28/2022 114.898 4.29 25.1 31.4 18.571 41.8 0.711787
2098 Belgium 12/29/2022 114.898 4.29 25.1 31.4 18.571 41.8 0.711787

2099 rows × 9 columns

In [15]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() within each location group so lags never cross a country boundary
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [16]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows treats "no history" as "zero mortality";
# dropping those rows may be safer — confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [17]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date' — it still includes
# 'Mortality Rate' and the lagged mortality columns, so the target leaks into the PCA
# inputs; this likely inflates the near-perfect R^2 downstream. PCA is also fit on
# unscaled data, so large-magnitude columns dominate the components. Verify intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[17]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [18]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# NOTE(review): iloc[:, 2:] still includes 'Mortality Rate' and the lagged mortality
# columns, so the target leaks into the PCA-derived features used as X below.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [19]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original variables;
# reusing the raw feature names here is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [20]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [21]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split; fixed random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [22]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[22]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [23]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [24]:
# Apply scaling on the test set (training-set statistics only — no test-set leakage here)
X_test_scaled = scaler.transform(X_test)
In [25]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 81 combinations; with cv=10 below this trains 810 forests
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [26]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scoring is R^2; consecutive daily rows are near-duplicates,
# so random CV folds overlap heavily and the score is likely optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9969932164978618
In [27]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [28]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01827006714784305
R2 Score: 0.9984568901791381
RMSE: 0.135167
Entropy Value: 0.0007004294195467055
In [29]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[29]:
feature importance
1 diabetes_prevalence 0.630882
0 cardiovasc_death_rate 0.309663
2 female_smokers 0.039951
5 median_age 0.015845
3 male_smokers 0.003381
4 aged_65_older 0.000277
In [30]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Re-read from disk to restore the full frame (df_updated was filtered/overwritten above).
# NOTE(review): hard-coded absolute local path — prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[30]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [31]:
country1 = 'Austria'
country2 = 'Belgium'

# Extracting important features for the Random Forest Model Analysis for the country health index
# (country-level infrastructure/economic covariates plus the 'Mortality Rate' target)
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the current country pair.
# NOTE(review): df_updated is overwritten in place; later pair analyses must re-read the CSV.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [32]:
# Quick visual check of the filtered frame
df_updated
Out[32]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2095 Belgium 12/26/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2096 Belgium 12/27/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2097 Belgium 12/28/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2098 Belgium 12/29/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787

2099 rows × 9 columns

In [33]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() within each location group so lags never cross a country boundary
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [34]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling warm-up rows treats "no history" as "zero mortality" — confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [35]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] still includes 'Mortality Rate' and the lagged mortality
# columns — the target leaks into the PCA inputs; PCA is also fit on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[35]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [36]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# NOTE(review): iloc[:, 2:] still includes 'Mortality Rate' and the lagged mortality
# columns, so the target leaks into the PCA-derived features used as X below.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [37]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original variables;
# reusing the raw feature names here is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [38]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [39]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split; fixed random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [40]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[40]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [41]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [42]:
# Apply scaling on the test set (training-set statistics only — no test-set leakage here)
X_test_scaled = scaler.transform(X_test)
In [43]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 81 combinations; with cv=10 below this trains 810 forests
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [44]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scoring is R^2; consecutive daily rows are near-duplicates,
# so random CV folds overlap heavily and the score is likely optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9978932925070829
In [45]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [46]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008856177569188209
R2 Score: 0.9992519975722188
RMSE: 0.094107
Entropy Value: 0.000433455898033473
In [47]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[47]:
feature importance
1 human_development_index 0.927909
2 extreme_poverty 0.038889
5 population 0.030571
3 gdp_per_capita 0.002263
4 population_density 0.000344
0 hospital_beds_per_thousand 0.000024
In [48]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Re-read from disk to restore the full frame (df_updated was filtered/overwritten above).
# NOTE(review): hard-coded absolute local path — prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[48]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [49]:
country1 = 'Canada'
country2 = 'Cyprus'

# Extracting important features for Random Forest Model Analysis for the population health index
# (population-health covariates plus the 'Mortality Rate' target, keyed by location/date)
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the current country pair.
# NOTE(review): df_updated is overwritten in place; later pair analyses must re-read the CSV.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [50]:
# Quick visual check of the filtered frame
df_updated
Out[50]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 16.984 41.4 1.093162

2099 rows × 9 columns

In [51]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() within each location group so lags never cross a country boundary
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [52]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling warm-up rows treats "no history" as "zero mortality" — confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [53]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] still includes 'Mortality Rate' and the lagged mortality
# columns — the target leaks into the PCA inputs; PCA is also fit on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[53]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [54]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# NOTE(review): iloc[:, 2:] still includes 'Mortality Rate' and the lagged mortality
# columns, so the target leaks into the PCA-derived features used as X below.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [55]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original variables;
# reusing the raw feature names here is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [56]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [57]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split; fixed random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [58]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[58]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [59]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [60]:
# Apply scaling on the test set (training-set statistics only — no test-set leakage here)
X_test_scaled = scaler.transform(X_test)
In [61]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 81 combinations; with cv=10 below this trains 810 forests
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [62]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scoring is R^2; consecutive daily rows are near-duplicates,
# so random CV folds overlap heavily and the score is likely optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9982932314972184
In [63]:
# GridSearchCV (refit=True by default) has already refit an estimator with the best
# hyperparameters on the whole training set — reuse it instead of duplicating that fit.
# best_estimator_ is a clone of `rf` (so random_state=42) fit on X_train_scaled/y_train,
# which is exactly what the manual re-instantiation + fit here reproduced.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
In [64]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression error
# metric, and is infinite wherever y_pred == 0 while y_test != 0. TODO: confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0032838118429310025
R2 Score: 0.9990345765129632
RMSE: 0.057305
Entropy Value: 0.00041277214242018204
In [65]:
# Per-feature importance from the fitted forest, sorted descending
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these importances belong
# to PC1..PC6 — labeling them with the original feature names is misleading.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[65]:
feature importance
0 cardiovasc_death_rate 0.853954
1 diabetes_prevalence 0.102526
2 female_smokers 0.020535
5 median_age 0.020360
3 male_smokers 0.002201
4 aged_65_older 0.000425
In [66]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path for portability and reproducibility on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Last expression renders the loaded frame as the cell output
df_updated
Out[66]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [67]:
# Pair of countries compared in this step
country1 = 'Canada'
country2 = 'Cyprus'

# Features used by the Random Forest analysis of the country health index
cols_needed = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
               'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
               'Mortality Rate']

# Column-select first, then keep only the rows belonging to the two countries
df_updated = df_updated[cols_needed]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [68]:
# Preview the filtered two-country frame
df_updated
Out[68]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.093162

2099 rows × 9 columns

In [69]:
# Lagged mortality features (previous day / week / month) convert this time series into a
# supervised-learning table: each row carries its own recent history as columns, so a
# Random Forest — a non-sequential ensemble model — can be applied directly to assess
# which variables best predict COVID-19 mortality per country.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [70]:
# The first rows of each country have no lag history; treat those missing lags as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [71]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows (before the train/test split), leaking test-set
# information into the components, and on unscaled features, so large-magnitude columns
# (e.g. population) dominate the components — TODO confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[71]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [72]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Project every row onto the PCA basis and keep only the first 6 components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [73]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components (PC1..PC6), NOT the original
# features — reusing the original feature names makes later importance tables misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the components (row order is preserved)
principal_df['location'] = df_updated['location'].values
In [74]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from principal_df);
# this step effectively just removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [75]:
# Features: the six retained principal components (column names are inherited labels);
# target: the COVID-19 mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Reserve 30% of the observations as a held-out test set for the Random Forest model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [76]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean and standard deviation from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
Out[76]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [77]:
# Apply scaling on the training set
# (standardize using the statistics learned during fit)
X_train_scaled = scaler.transform(X_train)
In [78]:
# Apply scaling on the test set
# (reuses the training-set mean/std — no test-set statistics leak in at this step)
X_test_scaled = scaler.transform(X_test)
In [79]:
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [80]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3 = 81 candidates x 10 folds = 810 fits — this cell is slow.
# Scoring defaults to the estimator's R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9989558784228938
In [81]:
# GridSearchCV (refit=True by default) has already refit an estimator with the best
# hyperparameters on the whole training set — reuse it instead of duplicating that fit.
# best_estimator_ is a clone of `rf` (so random_state=42) fit on X_train_scaled/y_train,
# which is exactly what the manual re-instantiation + fit here reproduced.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
In [82]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression error
# metric, and is infinite wherever y_pred == 0 while y_test != 0. TODO: confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0032162020605789846
R2 Score: 0.9990544534349546
RMSE: 0.056712
Entropy Value: 0.000341364428693161
In [83]:
# Per-feature importance from the fitted forest, sorted descending
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these importances belong
# to PC1..PC6 — labeling them with the original feature names is misleading.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[83]:
feature importance
5 population 0.515688
1 human_development_index 0.452145
2 extreme_poverty 0.028238
3 gdp_per_capita 0.003655
4 population_density 0.000192
0 hospital_beds_per_thousand 0.000083
In [84]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path for portability and reproducibility on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Last expression renders the loaded frame as the cell output
df_updated
Out[84]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [85]:
# Pair of countries compared in this step
country1 = 'Denmark'
country2 = 'Finland'

# Features used by the Random Forest analysis of the population health index
cols_needed = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'aged_65_older', 'median_age',
               'Mortality Rate']

# Column-select first, then keep only the rows belonging to the two countries
df_updated = df_updated[cols_needed]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [86]:
# Preview the filtered two-country frame
df_updated
Out[86]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 18.8 19.677 42.3 0.00000
5188 Denmark 2/3/2020 114.767 6.41 19.3 18.8 19.677 42.3 0.00000
5189 Denmark 2/4/2020 114.767 6.41 19.3 18.8 19.677 42.3 0.00000
5190 Denmark 2/5/2020 114.767 6.41 19.3 18.8 19.677 42.3 0.00000
5191 Denmark 2/6/2020 114.767 6.41 19.3 18.8 19.677 42.3 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 5.76 18.3 22.6 21.228 42.8 0.55159
8372 Finland 12/26/2022 153.507 5.76 18.3 22.6 21.228 42.8 0.55159
8373 Finland 12/27/2022 153.507 5.76 18.3 22.6 21.228 42.8 0.55159
8374 Finland 12/28/2022 153.507 5.76 18.3 22.6 21.228 42.8 0.55159
8375 Finland 12/29/2022 153.507 5.76 18.3 22.6 21.228 42.8 0.55159

2128 rows × 9 columns

In [87]:
# Lagged mortality features (previous day / week / month) convert this time series into a
# supervised-learning table: each row carries its own recent history as columns, so a
# Random Forest — a non-sequential ensemble model — can be applied directly to assess
# which variables best predict COVID-19 mortality per country.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [88]:
# The first rows of each country have no lag history; treat those missing lags as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [89]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows (before the train/test split), leaking test-set
# information into the components, and on unscaled features, so large-magnitude columns
# dominate the components — TODO confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[89]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [90]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Project every row onto the PCA basis and keep only the first 6 components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [91]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components (PC1..PC6), NOT the original
# features — reusing the original feature names makes later importance tables misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
# Carry the country label alongside the components (row order is preserved)
principal_df['location'] = df_updated['location'].values
In [92]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from principal_df);
# this step effectively just removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [93]:
# Features: the six retained principal components (column names are inherited labels);
# target: the COVID-19 mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Reserve 30% of the observations as a held-out test set for the Random Forest model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [94]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean and standard deviation from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
Out[94]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [95]:
# Apply scaling on the training set
# (standardize using the statistics learned during fit)
X_train_scaled = scaler.transform(X_train)
In [96]:
# Apply scaling on the test set
# (reuses the training-set mean/std — no test-set statistics leak in at this step)
X_test_scaled = scaler.transform(X_test)
In [97]:
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [98]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3 = 81 candidates x 10 folds = 810 fits — this cell is slow.
# Scoring defaults to the estimator's R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989059824652742
In [99]:
# GridSearchCV (refit=True by default) has already refit an estimator with the best
# hyperparameters on the whole training set — reuse it instead of duplicating that fit.
# best_estimator_ is a clone of `rf` (so random_state=42) fit on X_train_scaled/y_train,
# which is exactly what the manual re-instantiation + fit here reproduced.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
In [100]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression error
# metric, and is infinite wherever y_pred == 0 while y_test != 0. TODO: confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008510747064107163
R2 Score: 0.9951720209422329
RMSE: 0.092254
Entropy Value: 0.0017388182720325979
In [101]:
# Per-feature importance from the fitted forest, sorted descending
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these importances belong
# to PC1..PC6 — labeling them with the original feature names is misleading.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[101]:
feature importance
1 diabetes_prevalence 0.955920
2 female_smokers 0.028636
3 male_smokers 0.005947
5 median_age 0.005500
0 cardiovasc_death_rate 0.003652
4 aged_65_older 0.000344
In [102]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path for portability and reproducibility on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Last expression renders the loaded frame as the cell output
df_updated
Out[102]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [103]:
# Pair of countries compared in this step
country1 = 'Denmark'
country2 = 'Finland'

# Features used by the Random Forest analysis of the country health index
cols_needed = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
               'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
               'Mortality Rate']

# Column-select first, then keep only the rows belonging to the two countries
df_updated = df_updated[cols_needed]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [104]:
# Preview the filtered two-country frame
df_updated
Out[104]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5188 Denmark 2/3/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5189 Denmark 2/4/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5190 Denmark 2/5/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5191 Denmark 2/6/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159

2128 rows × 9 columns

In [105]:
# Lagged mortality features (previous day / week / month) convert this time series into a
# supervised-learning table: each row carries its own recent history as columns, so a
# Random Forest — a non-sequential ensemble model — can be applied directly to assess
# which variables best predict COVID-19 mortality per country.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [106]:
# The first rows of each country have no lag history; treat those missing lags as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [107]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows (before the train/test split), leaking test-set
# information into the components, and on unscaled features, so large-magnitude columns
# (e.g. population) dominate the components — TODO confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[107]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [108]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Project every row onto the PCA basis and keep only the first 6 components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [109]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components (PC1..PC6), NOT the original
# features — reusing the original feature names makes later importance tables misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the components (row order is preserved)
principal_df['location'] = df_updated['location'].values
In [110]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from principal_df);
# this step effectively just removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [111]:
# Features: the six retained principal components (column names are inherited labels);
# target: the COVID-19 mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Reserve 30% of the observations as a held-out test set for the Random Forest model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [112]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean and standard deviation from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
Out[112]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [113]:
# Apply scaling on the training set
# (standardize using the statistics learned during fit)
X_train_scaled = scaler.transform(X_train)
In [114]:
# Apply scaling on the test set
# (reuses the training-set mean/std — no test-set statistics leak in at this step)
X_test_scaled = scaler.transform(X_test)
In [115]:
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [116]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3 = 81 candidates x 10 folds = 810 fits — this cell is slow.
# Scoring defaults to the estimator's R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989156833724155
In [117]:
# GridSearchCV (refit=True by default) has already refit an estimator with the best
# hyperparameters on the whole training set — reuse it instead of duplicating that fit.
# best_estimator_ is a clone of `rf` (so random_state=42) fit on X_train_scaled/y_train,
# which is exactly what the manual re-instantiation + fit here reproduced.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
In [118]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression error
# metric, and is infinite wherever y_pred == 0 while y_test != 0. TODO: confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008329520038950185
R2 Score: 0.995274827461516
RMSE: 0.091266
Entropy Value: 0.0016508456649447665
In [119]:
# Per-feature importance from the fitted forest, sorted descending
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these importances belong
# to PC1..PC6 — labeling them with the original feature names is misleading.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[119]:
feature importance
1 human_development_index 0.956574
2 extreme_poverty 0.028531
5 population 0.008398
3 gdp_per_capita 0.006081
4 population_density 0.000371
0 hospital_beds_per_thousand 0.000044
In [120]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path for portability and reproducibility on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Last expression renders the loaded frame as the cell output
df_updated
Out[120]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [121]:
# Pair of countries compared in this step
country1 = 'France'
country2 = 'Iceland'

# Features used by the Random Forest analysis of the population health index
cols_needed = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'aged_65_older', 'median_age',
               'Mortality Rate']

# Column-select first, then keep only the rows belonging to the two countries
df_updated = df_updated[cols_needed]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [122]:
df_updated
Out[122]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
8376 France 1/24/2020 86.060 4.77 30.1 35.6 19.718 42.0 0.00000
8377 France 1/25/2020 86.060 4.77 30.1 35.6 19.718 42.0 0.00000
8378 France 1/26/2020 86.060 4.77 30.1 35.6 19.718 42.0 0.00000
8379 France 1/27/2020 86.060 4.77 30.1 35.6 19.718 42.0 0.00000
8380 France 1/28/2020 86.060 4.77 30.1 35.6 19.718 42.0 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 14.431 37.3 0.11011

2107 rows × 9 columns

In [123]:
# Convert the OWID time series into a supervised-learning table by adding
# lagged mortality features (previous day / week / ~month). Each row then
# carries its own recent history, which is how a non-sequential model such
# as a random forest can consume temporal data.
# Group by country first so lag values never cross a country boundary;
# hoisting the groupby avoids recomputing the same grouping three times.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [124]:
# The first 1/7/30 rows of each country have no lag history; fill those
# NaNs with 0 in one vectorised call instead of three separate assignments.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [125]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lag columns, so the components are partly built from the target
# itself — likely target leakage that inflates the downstream R^2; confirm intent.
# NOTE(review): PCA is fit on unscaled data, so the largest-scale columns
# dominate the components; consider standardising BEFORE PCA, not after.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[125]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [126]:
# Keep the first 6 principal components. The count matches the number of
# original input variables, but the components themselves are NOT those variables.
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [127]:
# Wrap the component scores in a DataFrame.
# NOTE(review): these columns are PC1..PC6 scores, not the original
# variables — reusing the raw feature names makes the later importance
# table read as if it ranked the original features. Prefer 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [128]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df), so this step only renames/expands columns before y extraction.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [129]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # PCA component scores (labels are misleading)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on a daily time series puts
# near-duplicate neighbouring days in both train and test, inflating
# scores; consider a chronological (time-based) split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [130]:
# Fit scaling on the training set only (test statistics never leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[130]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [131]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [132]:
# Apply the train-fitted scaling to the test set
X_test_scaled = scaler.transform(X_test)
In [133]:
# Instantiate the RandomForestRegressor Model
# (n_estimators here is a placeholder; the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [134]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 81 candidates x 10 folds = 810 fits — pass n_jobs=-1 to
# parallelise. Plain KFold on time-ordered rows also shares near-duplicate
# days across folds; consider TimeSeriesSplit for honest CV scores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9921689117203274
In [135]:
# Use the best model found by the grid search. GridSearchCV already refits
# the winning parameter combination on the whole training set (refit=True
# is the default), so best_estimator_ is exactly the model the manual
# parameter-by-parameter rebuild would produce (same hyperparameters,
# same random_state=42 inherited from the base estimator).
best_rf_model = grid_search.best_estimator_

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [136]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between the
# two vectors renormalised as probability distributions — it is not a
# regression error metric, and zeros in y_test make it ill-defined;
# consider dropping it or reporting MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.08132098431951777
R2 Score: 0.9935334250489188
RMSE: 0.285168
Entropy Value: 0.00169048811690724
In [137]:
# Rank model inputs by random-forest importance.
# NOTE(review): the model was trained on PCA component scores, so these
# importances belong to PC1..PC6; the original-feature labels are
# misleading — each component mixes all input variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[137]:
feature importance
1 diabetes_prevalence 0.688876
0 cardiovasc_death_rate 0.269106
5 median_age 0.026844
2 female_smokers 0.009094
3 male_smokers 0.005258
4 aged_65_older 0.000822
In [138]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# (re-load resets df_updated after the previous pairing's filtering/mutation)
# NOTE(review): hardcoded absolute Windows path — prefer a DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[138]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [139]:
country1 = 'France'
country2 = 'Iceland'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() so the later lag-column assignments write into an independent
# frame, not a filtered view (avoids SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [140]:
df_updated
Out[140]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2107 rows × 9 columns

In [141]:
# Convert the OWID time series into a supervised-learning table by adding
# lagged mortality features (previous day / week / ~month) so a
# non-sequential model (random forest) can consume the temporal signal.
# Group by country so lag values never cross a country boundary; hoist the
# repeated groupby into a single grouped series.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [142]:
# Fill the missing lag history at the start of each country's series with 0
# in one vectorised call instead of three separate assignments.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [143]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lag columns — likely target leakage; PCA is also fit on unscaled data so
# large-scale columns (e.g. population) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[143]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [144]:
# Keep the first 6 principal components (components, not the 6 original variables)
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [145]:
# Wrap the component scores in a DataFrame.
# NOTE(review): these columns are PC1..PC6 scores, not the original
# variables; reusing raw feature names makes later importances misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [146]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [147]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # PCA component scores (labels are misleading)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split on a daily time series leaks near-duplicate
# days across train/test; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [148]:
# Fit scaling on the training set only (no test-set leakage into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[148]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [149]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [150]:
# Apply the train-fitted scaling to the test set
X_test_scaled = scaler.transform(X_test)
In [151]:
# Instantiate the RandomForestRegressor Model (hyperparameters below are
# overridden by the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [152]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 810 fits total — consider n_jobs=-1; plain KFold on
# time-ordered rows shares near-duplicate days across folds (TimeSeriesSplit
# would give honest CV scores).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9906321989356222
In [153]:
# Use the best model from the grid search directly: GridSearchCV refits the
# winning combination on the full training set by default (refit=True), so
# best_estimator_ equals the manually rebuilt model (same hyperparameters,
# same random_state=42 from the base estimator).
best_rf_model = grid_search.best_estimator_

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [154]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(p, q) is KL divergence of renormalised vectors —
# not a regression metric, and zeros in y_test make it ill-defined.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.1263773231369439
R2 Score: 0.9899505836160186
RMSE: 0.355496
Entropy Value: 0.0032779127594959943
In [155]:
# Rank model inputs by random-forest importance.
# NOTE(review): these importances belong to PC1..PC6 (the model's actual
# inputs), mislabeled here with the original feature names.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[155]:
feature importance
1 human_development_index 0.958053
2 extreme_poverty 0.017445
5 population 0.014344
3 gdp_per_capita 0.005157
0 hospital_beds_per_thousand 0.004252
4 population_density 0.000749
In [156]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# (re-load resets df_updated after the previous pairing's filtering/mutation)
# NOTE(review): hardcoded absolute Windows path — prefer a DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[156]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [157]:
country1 = 'Ireland'
country2 = 'Italy'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() so the later lag-column assignments write into an independent
# frame, not a filtered view (avoids SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [158]:
df_updated
Out[158]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 3.28 23.0 25.7 13.928 38.7 0.000000
18839 Ireland 3/1/2020 126.459 3.28 23.0 25.7 13.928 38.7 0.000000
18840 Ireland 3/2/2020 126.459 3.28 23.0 25.7 13.928 38.7 0.000000
18841 Ireland 3/3/2020 126.459 3.28 23.0 25.7 13.928 38.7 0.000000
18842 Ireland 3/4/2020 126.459 3.28 23.0 25.7 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 23.021 47.9 0.735109

2099 rows × 9 columns

In [159]:
# Convert the OWID time series into a supervised-learning table by adding
# lagged mortality features (previous day / week / ~month) so a
# non-sequential model (random forest) can consume the temporal signal.
# Group by country so lag values never cross a country boundary; hoist the
# repeated groupby into a single grouped series.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [160]:
# Fill the missing lag history at the start of each country's series with 0
# in one vectorised call instead of three separate assignments.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [161]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lag columns — likely target leakage; PCA is also fit on unscaled data so
# the largest-scale columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[161]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [162]:
# Keep the first 6 principal components (components, not the 6 original variables)
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [163]:
# Wrap the component scores in a DataFrame.
# NOTE(review): these columns are PC1..PC6 scores, not the original
# variables; reusing raw feature names makes later importances misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [164]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [165]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # PCA component scores (labels are misleading)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split on a daily time series leaks near-duplicate
# days across train/test; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [166]:
# Fit scaling on the training set only (no test-set leakage into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[166]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [167]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [168]:
# Apply the train-fitted scaling to the test set
X_test_scaled = scaler.transform(X_test)
In [169]:
# Instantiate the RandomForestRegressor Model (hyperparameters below are
# overridden by the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [170]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 810 fits total — consider n_jobs=-1; plain KFold on
# time-ordered rows shares near-duplicate days across folds (TimeSeriesSplit
# would give honest CV scores).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998640466858235
In [171]:
# Use the best model from the grid search directly: GridSearchCV refits the
# winning combination on the full training set by default (refit=True), so
# best_estimator_ equals the manually rebuilt model (same hyperparameters,
# same random_state=42 from the base estimator).
best_rf_model = grid_search.best_estimator_

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [172]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(p, q) is KL divergence of renormalised vectors —
# not a regression metric, and zeros in y_test make it ill-defined.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.012181853469582376
R2 Score: 0.9989992331498202
RMSE: 0.110371
Entropy Value: 0.00035999671659794484
In [173]:
# Rank model inputs by random-forest importance.
# NOTE(review): these importances belong to PC1..PC6 (the model's actual
# inputs), mislabeled here with the original feature names.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[173]:
feature importance
5 median_age 0.823708
1 diabetes_prevalence 0.127188
0 cardiovasc_death_rate 0.028491
2 female_smokers 0.017857
3 male_smokers 0.002469
4 aged_65_older 0.000286
In [174]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# (re-load resets df_updated after the previous pairing's filtering/mutation)
# NOTE(review): hardcoded absolute Windows path — prefer a DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[174]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [175]:
country1 = 'Ireland'
country2 = 'Italy'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() so the later lag-column assignments write into an independent
# frame, not a filtered view (avoids SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [176]:
df_updated
Out[176]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2099 rows × 9 columns

In [177]:
# Convert the OWID time series into a supervised-learning table by adding
# lagged mortality features (previous day / week / ~month) so a
# non-sequential model (random forest) can consume the temporal signal.
# Group by country so lag values never cross a country boundary; hoist the
# repeated groupby into a single grouped series.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [178]:
# Fill the missing lag history at the start of each country's series with 0
# in one vectorised call instead of three separate assignments.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [179]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lag columns — likely target leakage; PCA is also fit on unscaled data so
# the largest-scale columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[179]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [180]:
# Keep the first 6 principal components (components, not the 6 original variables)
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [181]:
# Wrap the component scores in a DataFrame.
# NOTE(review): these columns are PC1..PC6 scores, not the original
# variables; reusing raw feature names makes later importances misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [182]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [183]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [184]:
# Fit scaling on the training set
# NOTE(review): scaling is applied to the PCA outputs; features are usually
# standardized BEFORE PCA — verify this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
Out[184]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [185]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [186]:
# Apply scaling on the test set (transform only — no refit, avoiding test-set leakage)
X_test_scaled = scaler.transform(X_test)
In [187]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [188]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986413470709365
In [189]:
# fit random forest model with best hyperparameters from above.
# Unpacking best_params_ keeps this cell automatically in sync with the grid;
# hand-copying each hyperparameter (as before) can silently drift if the grid
# ever changes. random_state pinned for reproducibility.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Hold-out predictions used by the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [190]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1
# and returns the KL divergence between them. Mortality rates are not
# probability distributions, and any zero prediction where y_test > 0 makes
# the result infinite — treat this number with caution / consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013803653998387413
R2 Score: 0.9988659985635657
RMSE: 0.117489
Entropy Value: 0.0004399732479586434
In [191]:
# NOTE(review): X was built from principal components, so these "importances"
# belong to PCs that were (misleadingly) named after raw features — they are
# NOT importances of the original variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[191]:
feature importance
1 human_development_index 0.937618
2 extreme_poverty 0.027005
5 population 0.023078
0 hospital_beds_per_thousand 0.008608
3 gdp_per_capita 0.002772
4 population_density 0.000918
In [192]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a DATA_DIR constant /
# pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[192]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [193]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Restrict to the population-health feature set plus identifiers and the
# target, then keep only this country pair.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated[population_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [194]:
df_updated
Out[194]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 128.275 4.42 20.9 26.0 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 128.275 4.42 20.9 26.0 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 128.275 4.42 20.9 26.0 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 128.275 4.42 20.9 26.0 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 128.275 4.42 20.9 26.0 14.312 39.7 0.377872

2078 rows × 9 columns

In [195]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): this lag-and-fill block (and the whole PCA/RF pipeline below)
# is copy-pasted once per country pair — extracting a helper function would
# remove the duplication and the risk of the copies drifting apart.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [196]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 days per country genuinely have no history;
# imputing 0 mortality is an assumption — confirm it does not bias early rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [197]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lagged copies, so the components are partly built from the target itself —
# confirm this leakage is intended. PCA also runs on unscaled data here while
# StandardScaler is applied only AFTER the projection (conventional order is
# scale -> PCA).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[197]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [198]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): 6 mirrors the input-variable count rather than an
# explained-variance criterion.
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [199]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL
# inputs), not the original variables; naming them after raw features is
# misleading for the downstream "feature importances".
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [200]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X comes from
# principal_df); only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [201]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Rows of principal_df and df_updated are position-aligned, so X and y line up.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split on a time series ignores temporal order — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [202]:
# Fit scaling on the training set
# NOTE(review): scaling is applied to PCA outputs; features are usually
# standardized BEFORE PCA — verify this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
Out[202]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [203]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [204]:
# Apply scaling on the test set (transform only — no refit, avoiding test-set leakage)
X_test_scaled = scaler.transform(X_test)
In [205]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [206]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985636639377201
In [207]:
# fit random forest model with best hyperparameters from above.
# Unpacking best_params_ keeps this cell automatically in sync with the grid;
# hand-copying each hyperparameter (as before) can silently drift if the grid
# ever changes. random_state pinned for reproducibility.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Hold-out predictions used by the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [208]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and returns
# KL divergence; mortality rates are not probability distributions, and a zero
# prediction where y_test > 0 yields inf — consider dropping this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01177642506262874
R2 Score: 0.9984651456288152
RMSE: 0.108519
Entropy Value: 0.0011725037332187857
In [209]:
# NOTE(review): X was built from principal components, so these "importances"
# belong to PCs mislabeled with raw feature names — not importances of the
# original variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[209]:
feature importance
1 diabetes_prevalence 0.792799
0 cardiovasc_death_rate 0.162571
2 female_smokers 0.023739
5 median_age 0.015139
3 male_smokers 0.004875
4 aged_65_older 0.000876
In [210]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a DATA_DIR constant /
# pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[210]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [211]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Restrict to the country-health-index feature set plus identifiers and the
# target, then keep only this country pair.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                       'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated[country_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [212]:
df_updated
Out[212]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872

2078 rows × 9 columns

In [213]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): this lag-and-fill block (and the whole PCA/RF pipeline below)
# is copy-pasted once per country pair — a helper function would remove the
# duplication and the risk of the copies drifting apart.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [214]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 days per country genuinely have no history;
# imputing 0 mortality is an assumption — confirm it does not bias early rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [215]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lagged copies, so the components are partly built from the target itself —
# confirm this leakage is intended. PCA also runs on unscaled data while
# StandardScaler is applied only AFTER the projection (conventional order is
# scale -> PCA).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[215]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [216]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): 6 mirrors the input-variable count rather than an
# explained-variance criterion.
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [217]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL
# inputs), not the original variables; naming them after raw features is
# misleading for the downstream "feature importances".
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [218]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X comes from
# principal_df); only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [219]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Rows of principal_df and df_updated are position-aligned, so X and y line up.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split on a time series ignores temporal order — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [220]:
# Fit scaling on the training set
# NOTE(review): scaling is applied to PCA outputs; features are usually
# standardized BEFORE PCA — verify this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
Out[220]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [221]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [222]:
# Apply scaling on the test set (transform only — no refit, avoiding test-set leakage)
X_test_scaled = scaler.transform(X_test)
In [223]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [224]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985273943931006
In [225]:
# fit random forest model with best hyperparameters from above.
# Unpacking best_params_ keeps this cell automatically in sync with the grid;
# hand-copying each hyperparameter (as before) can silently drift if the grid
# ever changes. random_state pinned for reproducibility.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Hold-out predictions used by the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [226]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and returns
# KL divergence; mortality rates are not probability distributions, and a zero
# prediction where y_test > 0 yields inf — consider dropping this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007512088981123421
R2 Score: 0.9990209284610492
RMSE: 0.086672
Entropy Value: 0.0009194530556352238
In [227]:
# NOTE(review): X was built from principal components, so these "importances"
# belong to PCs mislabeled with raw feature names — not importances of the
# original variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[227]:
feature importance
1 human_development_index 0.943109
2 extreme_poverty 0.037435
5 population 0.012103
3 gdp_per_capita 0.006234
0 hospital_beds_per_thousand 0.000694
4 population_density 0.000425
In [228]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a DATA_DIR constant /
# pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[228]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [229]:
country1 = 'Portugal'
country2 = 'Slovenia'

# Restrict to the population-health feature set plus identifiers and the
# target, then keep only this country pair.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated[population_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [230]:
df_updated
Out[230]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 19.062 44.5 0.536669

2096 rows × 9 columns

In [231]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): this lag-and-fill block (and the whole PCA/RF pipeline below)
# is copy-pasted once per country pair — a helper function would remove the
# duplication and the risk of the copies drifting apart.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [232]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 days per country genuinely have no history;
# imputing 0 mortality is an assumption — confirm it does not bias early rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [233]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lagged copies, so the components are partly built from the target itself —
# confirm this leakage is intended. PCA also runs on unscaled data while
# StandardScaler is applied only AFTER the projection (conventional order is
# scale -> PCA).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[233]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [234]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): 6 mirrors the input-variable count rather than an
# explained-variance criterion.
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [235]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL
# inputs), not the original variables; naming them after raw features is
# misleading for the downstream "feature importances".
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [236]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X comes from
# principal_df); only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [237]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Rows of principal_df and df_updated are position-aligned, so X and y line up.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split on a time series ignores temporal order — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [238]:
# Fit scaling on the training set
# NOTE(review): scaling is applied to PCA outputs; features are usually
# standardized BEFORE PCA — verify this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
Out[238]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [239]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [240]:
# Apply scaling on the test set (transform only — no refit, avoiding test-set leakage)
X_test_scaled = scaler.transform(X_test)
In [241]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [242]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983664740108746
In [243]:
# fit random forest model with best hyperparameters from above.
# Unpacking best_params_ keeps this cell automatically in sync with the grid;
# hand-copying each hyperparameter (as before) can silently drift if the grid
# ever changes. random_state pinned for reproducibility.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Hold-out predictions used by the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [244]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and returns
# KL divergence; mortality rates are not probability distributions, and a zero
# prediction where y_test > 0 yields inf — consider dropping this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00327553495146727
R2 Score: 0.9983603374909553
RMSE: 0.057232
Entropy Value: 0.0004816810301686089
In [245]:
# NOTE(review): X was built from principal components, so these "importances"
# belong to PCs mislabeled with raw feature names — not importances of the
# original variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[245]:
feature importance
1 diabetes_prevalence 0.930766
0 cardiovasc_death_rate 0.038511
2 female_smokers 0.027558
5 median_age 0.001745
3 male_smokers 0.001122
4 aged_65_older 0.000299
In [246]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a DATA_DIR constant /
# pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[246]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [247]:
country1 = 'Portugal'
country2 = 'Slovenia'

# Restrict to the two countries under comparison and to the columns used by
# the country-health-index Random Forest analysis (done in one .loc step).
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand',
                     'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                     'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
In [248]:
df_updated
Out[248]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2096 rows × 9 columns

In [249]:
# Convert the time series into a supervised-learning table. Random Forests
# expect independent tabular rows, so lagged copies of the target (previous
# day / week / month mortality) carry the temporal signal into the features.
#
# .copy() first: df_updated is a row-filtered slice of the imported frame,
# and assigning new columns to a slice triggers pandas' SettingWithCopyWarning
# and may not write through reliably.
df_updated = df_updated.copy()

# Create lagged variables for the previous day, week, and month mortality rates
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [250]:
# The first day/week/month of each country's series has no earlier value to
# lag from; treat those missing lags as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [251]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] currently holds the 6 predictors PLUS
# 'Mortality Rate' (the target) and its 3 lag columns, so the fitted
# components encode the label itself (target leakage, inflating downstream
# R^2). PCA is also fit on the full dataset before the train/test split and
# on unscaled values — confirm and restrict to scaled training features only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[251]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [252]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): the PCA input has 10 columns here (6 predictors, the target,
# and 3 target lags), so these 6 leading components are mixtures that include
# the mortality target — not a 1:1 stand-in for the 6 predictors.
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [253]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear combination of all PCA inputs), not the
# original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [254]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): within this section the resulting location_* dummy columns are
# never read again (X is built from principal_df, y from 'Mortality Rate');
# this step may be dead code — confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [255]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density', 'population']

# Inputs are the PCA scores; the target is the raw mortality rate.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [256]:
# Fit scaling on the training set
# (statistics learned from X_train only; applied to train and test below)
scaler = StandardScaler()
scaler.fit(X_train)
Out[256]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [257]:
# Apply scaling on the training set
# (transform uses the mean/std statistics fitted on X_train above)
X_train_scaled = scaler.transform(X_train)
In [258]:
# Apply scaling on the test set
# (reuses the training statistics — no refit on test data, avoiding leakage here)
X_test_scaled = scaler.transform(X_test)
In [259]:
# Base Random Forest used as the estimator for the grid search
# (fixed random_state so results are reproducible).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyper-parameter grid: 3 x 3 x 3 x 3 = 81 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [260]:
# Exhaustively evaluate param_grid with 10-fold cross-validation on the
# scaled training data (default regressor scoring: R^2).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyper-parameters and their mean CV score.
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best CV score: {grid_search.best_score_}')
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.998167144839121
In [261]:
# Refit the final Random Forest on the full training split with the tuned
# hyper-parameters. Forwarding best_params_ wholesale keeps this cell correct
# if the search grid ever gains extra keys (the previous version copied each
# entry by hand). Note: grid_search.best_estimator_ is an equivalent,
# already-refit model (GridSearchCV refit=True by default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test rows for evaluation below.
y_pred = best_rf_model.predict(X_test_scaled)
In [262]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — applying it to
# raw mortality values is not a standard regression metric, and any zero in
# y_pred where y_test is non-zero yields inf. Confirm this is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE: same units as the target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0038490153568325174
R2 Score: 0.9980732655059875
RMSE: 0.062040
Entropy Value: 0.000510878761187502
In [263]:
# Rank the model inputs by Random-Forest importance, highest first.
# X was built from principal_df, i.e. PCA scores — each importance therefore
# belongs to a principal component (a mix of all PCA inputs), not to a single
# original column, so the rows are labelled PC1..PCk instead of reusing
# selected_cols (the previous labels were misleading).
importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': [f'PC{i + 1}' for i in range(len(importances))],
    'importance': importances,
}).sort_values('importance', ascending=False)
feature_importances
Out[263]:
feature importance
1 human_development_index 0.964288
2 extreme_poverty 0.029185
5 population 0.004776
3 gdp_per_capita 0.001256
4 population_density 0.000316
0 hospital_beds_per_thousand 0.000179
In [264]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — not reproducible on other
# machines; prefer a configurable DATA_DIR / relative Path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[264]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [265]:
country1 = 'Spain'
country2 = 'Sweden'

# Restrict to the two countries under comparison and to the columns used by
# the population-health-index Random Forest analysis (done in one .loc step).
health_index_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                     'female_smokers', 'male_smokers', 'aged_65_older', 'median_age',
                     'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
In [266]:
df_updated
Out[266]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
23011 Sweden 2/1/2020 133.982 4.79 18.8 18.9 19.985 41.0 0.000000
23012 Sweden 2/2/2020 133.982 4.79 18.8 18.9 19.985 41.0 0.000000
23013 Sweden 2/3/2020 133.982 4.79 18.8 18.9 19.985 41.0 0.000000
23014 Sweden 2/4/2020 133.982 4.79 18.8 18.9 19.985 41.0 0.000000
23015 Sweden 2/5/2020 133.982 4.79 18.8 18.9 19.985 41.0 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 19.436 45.5 0.855148

2126 rows × 9 columns

In [267]:
# Convert the time series into a supervised-learning table. Random Forests
# expect independent tabular rows, so lagged copies of the target (previous
# day / week / month mortality) carry the temporal signal into the features.
#
# .copy() first: df_updated is a row-filtered slice of the imported frame,
# and assigning new columns to a slice triggers pandas' SettingWithCopyWarning
# and may not write through reliably.
df_updated = df_updated.copy()

# Create lagged variables for the previous day, week, and month mortality rates
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [268]:
# The first day/week/month of each country's series has no earlier value to
# lag from; treat those missing lags as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [269]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] currently holds the 6 predictors PLUS
# 'Mortality Rate' (the target) and its 3 lag columns, so the fitted
# components encode the label itself (target leakage, inflating downstream
# R^2). PCA is also fit on the full dataset before the train/test split and
# on unscaled values — confirm and restrict to scaled training features only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[269]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [270]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): the PCA input has 10 columns here (6 predictors, the target,
# and 3 target lags), so these 6 leading components are mixtures that include
# the mortality target — not a 1:1 stand-in for the 6 predictors.
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [271]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear combination of all PCA inputs), not the
# original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [272]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): within this section the resulting location_* dummy columns are
# never read again (X is built from principal_df, y from 'Mortality Rate');
# this step may be dead code — confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [273]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'aged_65_older', 'median_age']

# Inputs are the PCA scores; the target is the raw mortality rate.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [274]:
# Fit scaling on the training set
# (statistics learned from X_train only; applied to train and test below)
scaler = StandardScaler()
scaler.fit(X_train)
Out[274]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [275]:
# Apply scaling on the training set
# (transform uses the mean/std statistics fitted on X_train above)
X_train_scaled = scaler.transform(X_train)
In [276]:
# Apply scaling on the test set
# (reuses the training statistics — no refit on test data, avoiding leakage here)
X_test_scaled = scaler.transform(X_test)
In [277]:
# Base Random Forest used as the estimator for the grid search
# (fixed random_state so results are reproducible).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyper-parameter grid: 3 x 3 x 3 x 3 = 81 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [278]:
# Exhaustively evaluate param_grid with 10-fold cross-validation on the
# scaled training data (default regressor scoring: R^2).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyper-parameters and their mean CV score.
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best CV score: {grid_search.best_score_}')
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984490067368526
In [279]:
# Refit the final Random Forest on the full training split with the tuned
# hyper-parameters. Forwarding best_params_ wholesale keeps this cell correct
# if the search grid ever gains extra keys (the previous version copied each
# entry by hand). Note: grid_search.best_estimator_ is an equivalent,
# already-refit model (GridSearchCV refit=True by default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test rows for evaluation below.
y_pred = best_rf_model.predict(X_test_scaled)
In [280]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — applying it to
# raw mortality values is not a standard regression metric, and any zero in
# y_pred where y_test is non-zero yields inf. Confirm this is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE: same units as the target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.014451320867759824
R2 Score: 0.9983049038248775
RMSE: 0.120214
Entropy Value: 0.0005419190094358684
In [281]:
# Rank the model inputs by Random-Forest importance, highest first.
# X was built from principal_df, i.e. PCA scores — each importance therefore
# belongs to a principal component (a mix of all PCA inputs), not to a single
# original column, so the rows are labelled PC1..PCk instead of reusing
# selected_cols (the previous labels were misleading).
importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': [f'PC{i + 1}' for i in range(len(importances))],
    'importance': importances,
}).sort_values('importance', ascending=False)
feature_importances
Out[281]:
feature importance
1 diabetes_prevalence 0.971991
2 female_smokers 0.021755
5 median_age 0.002900
3 male_smokers 0.002066
0 cardiovasc_death_rate 0.001038
4 aged_65_older 0.000250
In [282]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — not reproducible on other
# machines; prefer a configurable DATA_DIR / relative Path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[282]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [283]:
country1 = 'Spain'
country2 = 'Sweden'

# Restrict to the two countries under comparison and to the columns used by
# the country-health-index Random Forest analysis (done in one .loc step).
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand',
                     'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                     'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
In [284]:
df_updated
Out[284]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
23011 Sweden 2/1/2020 2.22 0.945 0.5 46949.283 24.718 10549349 0.000000
23012 Sweden 2/2/2020 2.22 0.945 0.5 46949.283 24.718 10549349 0.000000
23013 Sweden 2/3/2020 2.22 0.945 0.5 46949.283 24.718 10549349 0.000000
23014 Sweden 2/4/2020 2.22 0.945 0.5 46949.283 24.718 10549349 0.000000
23015 Sweden 2/5/2020 2.22 0.945 0.5 46949.283 24.718 10549349 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148

2126 rows × 9 columns

In [285]:
# Convert the time series into a supervised-learning table. Random Forests
# expect independent tabular rows, so lagged copies of the target (previous
# day / week / month mortality) carry the temporal signal into the features.
#
# .copy() first: df_updated is a row-filtered slice of the imported frame,
# and assigning new columns to a slice triggers pandas' SettingWithCopyWarning
# and may not write through reliably.
df_updated = df_updated.copy()

# Create lagged variables for the previous day, week, and month mortality rates
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [286]:
# The first day/week/month of each country's series has no earlier value to
# lag from; treat those missing lags as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [287]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] currently holds the 6 predictors PLUS
# 'Mortality Rate' (the target) and its 3 lag columns, so the fitted
# components encode the label itself (target leakage, inflating downstream
# R^2). PCA is also fit on the full dataset before the train/test split and
# on unscaled values — confirm and restrict to scaled training features only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[287]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [288]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): the PCA input has 10 columns here (6 predictors, the target,
# and 3 target lags), so these 6 leading components are mixtures that include
# the mortality target — not a 1:1 stand-in for the 6 predictors.
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [289]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear combination of all PCA inputs), not the
# original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [290]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): within this section the resulting location_* dummy columns are
# never read again (X is built from principal_df, y from 'Mortality Rate');
# this step may be dead code — confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [291]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density', 'population']

# Inputs are the PCA scores; the target is the raw mortality rate.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [292]:
# Fit scaling on the training set
# (statistics learned from X_train only; applied to train and test below)
scaler = StandardScaler()
scaler.fit(X_train)
Out[292]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [293]:
# Apply scaling on the training set
# (transform uses the mean/std statistics fitted on X_train above)
X_train_scaled = scaler.transform(X_train)
In [294]:
# Apply scaling on the test set
# (reuses the training statistics — no refit on test data, avoiding leakage here)
X_test_scaled = scaler.transform(X_test)
In [295]:
# Base Random Forest used as the estimator for the grid search
# (fixed random_state so results are reproducible).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyper-parameter grid: 3 x 3 x 3 x 3 = 81 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [296]:
# Exhaustively evaluate param_grid with 10-fold cross-validation on the
# scaled training data (default regressor scoring: R^2).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyper-parameters and their mean CV score.
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best CV score: {grid_search.best_score_}')
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985661804087467
In [297]:
# Refit the final Random Forest on the full training split with the tuned
# hyper-parameters. Forwarding best_params_ wholesale keeps this cell correct
# if the search grid ever gains extra keys (the previous version copied each
# entry by hand). Note: grid_search.best_estimator_ is an equivalent,
# already-refit model (GridSearchCV refit=True by default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test rows for evaluation below.
y_pred = best_rf_model.predict(X_test_scaled)
In [298]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — applying it to
# raw mortality values is not a standard regression metric, and any zero in
# y_pred where y_test is non-zero yields inf. Confirm this is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE: same units as the target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01796724914647103
R2 Score: 0.9978924960850047
RMSE: 0.134042
Entropy Value: 0.0006245047380715956
In [299]:
# Rank the model inputs by Random-Forest importance, highest first.
# X was built from principal_df, i.e. PCA scores — each importance therefore
# belongs to a principal component (a mix of all PCA inputs), not to a single
# original column, so the rows are labelled PC1..PCk instead of reusing
# selected_cols (the previous labels were misleading).
importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': [f'PC{i + 1}' for i in range(len(importances))],
    'importance': importances,
}).sort_values('importance', ascending=False)
feature_importances
Out[299]:
feature importance
1 human_development_index 0.972786
2 extreme_poverty 0.022063
5 population 0.002947
3 gdp_per_capita 0.001796
4 population_density 0.000333
0 hospital_beds_per_thousand 0.000074
In [300]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — not reproducible on other
# machines; prefer a configurable DATA_DIR / relative Path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[300]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [301]:
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Restrict to the two countries under comparison and to the columns used by
# the population-health-index Random Forest analysis (done in one .loc step).
health_index_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                     'female_smokers', 'male_smokers', 'aged_65_older', 'median_age',
                     'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
In [302]:
df_updated
Out[302]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 99.739 5.59 22.6 28.9 18.436 43.1 0.322922
14645 Switzerland 12/26/2022 99.739 5.59 22.6 28.9 18.436 43.1 0.322922
14646 Switzerland 12/27/2022 99.739 5.59 22.6 28.9 18.436 43.1 0.322922
14647 Switzerland 12/28/2022 99.739 5.59 22.6 28.9 18.436 43.1 0.323082
14648 Switzerland 12/29/2022 99.739 5.59 22.6 28.9 18.436 43.1 0.322149

2102 rows × 9 columns

In [303]:
# Convert the time series into a supervised-learning table. Random Forests
# expect independent tabular rows, so lagged copies of the target (previous
# day / week / month mortality) carry the temporal signal into the features.
#
# .copy() first: df_updated is a row-filtered slice of the imported frame,
# and assigning new columns to a slice triggers pandas' SettingWithCopyWarning
# and may not write through reliably.
df_updated = df_updated.copy()

# Create lagged variables for the previous day, week, and month mortality rates
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [304]:
# Zero-fill the NaNs that shift() left at the start of each country's series.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [305]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column from the third onward, which at this
# point appears to include 'Mortality Rate' and the lagged-mortality columns — the
# target would leak into the components. PCA is also scale-sensitive and this data
# is unscaled (StandardScaler is only fitted later). TODO confirm the intended slice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[305]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [306]:
# Keep only the first six principal components — one per Random Forest input
# variable for the population health index analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
In [307]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (PC1..PC6), not the original
# variables — reusing the raw feature names makes the later feature-importance table
# read as if it ranked raw variables when it actually ranks components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
# Attach the country label by position alongside the components.
principal_df['location'] = df_updated['location'].values
In [308]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream — X is built from
# principal_df and y reads only 'Mortality Rate' — so this step may be removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [309]:
# Model inputs: the six principal components; target: the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default; with time-series-derived rows,
# near-duplicate adjacent days can straddle the train/test boundary and inflate
# the test scores. Consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [310]:
# Fit scaling on the training set only, so test-set statistics never leak into training
scaler = StandardScaler()
scaler.fit(X_train)
Out[310]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [311]:
# Apply the training-fitted scaling to the training features
X_train_scaled = scaler.transform(X_train)
In [312]:
# Apply the same training-fitted scaling to the test features (no refit)
X_test_scaled = scaler.transform(X_test)
In [313]:
# Instantiate the RandomForestRegressor Model; random_state fixed for reproducibility
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [314]:
# Exhaustive grid search with 10-fold cross-validation (k = 10). For a regressor
# the default scoring is R^2. n_jobs=-1 fans the 81 x 10 fits out across all CPU
# cores; results are unchanged because every candidate keeps random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.96135172234504
In [315]:
# Refit a Random Forest on the full training split using the tuned hyperparameters.
# Dict unpacking of best_params_ avoids hand-copying each parameter (the previous
# form repeated every key and could silently drift from the searched grid).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [316]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into probability
# distributions and returns the KL divergence D(y_test || y_pred); it is not a
# conventional regression metric and is undefined where y_pred is zero but y_test
# is not. Confirm this statistic is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the mortality rate
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.28543037523612425
R2 Score: 0.988729253100176
RMSE: 0.534257
Entropy Value: 0.0037396223964769847
In [317]:
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[317]:
feature importance
0 cardiovasc_death_rate 0.823701
5 median_age 0.064555
1 diabetes_prevalence 0.055837
2 female_smokers 0.026659
3 male_smokers 0.019169
4 aged_65_older 0.010079
In [318]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory (pathlib.Path).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[318]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [319]:
# Country pair compared in this run
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lag-column
# assignments in later cells write to a real frame rather than a view
# (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [320]:
# Preview the filtered two-country feature frame
df_updated
Out[320]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.20 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.20 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.20 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.20 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.20 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14645 Switzerland 12/26/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14646 Switzerland 12/27/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14647 Switzerland 12/28/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.323082
14648 Switzerland 12/29/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322149

2102 rows × 9 columns

In [321]:
'''
Add lagged mortality-rate columns (previous day, week, month) per country with
pandas' shift(). This turns the time series into a tabular supervised-learning
problem so the Random Forest — an ensemble method for non-sequential data — can
be used to rank predictors of COVID-19 mortality per country.
'''
# Group once, then derive all three lags from the same grouped series.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for shift_days, lag_col in [(1, 'prev_day_mortality'),
                            (7, 'prev_week_mortality'),
                            (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(shift_days)
In [322]:
# Zero-fill the NaNs that shift() left at the start of each country's series.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [323]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lag columns here, so
# the target leaks into the components; PCA is also fitted on unscaled data.
# TODO confirm the intended column slice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[323]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [324]:
# Keep only the first six principal components — one per Random Forest input
# variable for the country health index analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
In [325]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the original
# variables; reusing the raw feature names makes the later importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Attach the country label by position alongside the components.
principal_df['location'] = df_updated['location'].values
In [326]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream (X is built from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [327]:
# Model inputs: the six principal components; target: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split of time-series rows can inflate test scores;
# consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [328]:
# Fit scaling on the training set only, so test-set statistics never leak into training
scaler = StandardScaler()
scaler.fit(X_train)
Out[328]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [329]:
# Apply the training-fitted scaling to the training features
X_train_scaled = scaler.transform(X_train)
In [330]:
# Apply the same training-fitted scaling to the test features (no refit)
X_test_scaled = scaler.transform(X_test)
In [331]:
# Instantiate the RandomForestRegressor Model; random_state fixed for reproducibility
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [332]:
# Exhaustive grid search with 10-fold cross-validation (k = 10). n_jobs=-1 runs
# the 81 x 10 fits in parallel across CPU cores; results are unchanged because
# every candidate keeps random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.958329239005943
In [333]:
# Refit a Random Forest on the full training split using the tuned hyperparameters.
# Dict unpacking of best_params_ avoids hand-copying each parameter.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [334]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) returns the KL divergence of the
# normalised arrays, not a conventional regression metric. Confirm it is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the mortality rate
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.5800890923694779
R2 Score: 0.9770941080323481
RMSE: 0.761636
Entropy Value: 0.004716031800673148
In [335]:
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[335]:
feature importance
1 human_development_index 0.832172
5 population 0.070924
2 extreme_poverty 0.041400
3 gdp_per_capita 0.035553
4 population_density 0.019898
0 hospital_beds_per_thousand 0.000053
In [336]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory (pathlib.Path).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[336]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [337]:
# Country pair compared in this run
country1 = 'Czechia'
country2 = 'Estonia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() so the lag-column assignments in later cells write to an independent
# frame rather than a view (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [338]:
# Preview the filtered two-country feature frame
df_updated
Out[338]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 19.027 43.3 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 19.027 43.3 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 19.027 43.3 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 19.027 43.3 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 19.027 43.3 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 255.569 4.02 24.5 39.3 19.452 42.7 0.464100
7306 Estonia 12/26/2022 255.569 4.02 24.5 39.3 19.452 42.7 0.464100
7307 Estonia 12/27/2022 255.569 4.02 24.5 39.3 19.452 42.7 0.463645
7308 Estonia 12/28/2022 255.569 4.02 24.5 39.3 19.452 42.7 0.466423
7309 Estonia 12/29/2022 255.569 4.02 24.5 39.3 19.452 42.7 0.466423

2095 rows × 9 columns

In [339]:
'''
Add lagged mortality-rate columns (previous day, week, month) per country with
pandas' shift(). This turns the time series into a tabular supervised-learning
problem so the Random Forest — an ensemble method for non-sequential data — can
be used to rank predictors of COVID-19 mortality per country.
'''
# Group once, then derive all three lags from the same grouped series.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for shift_days, lag_col in [(1, 'prev_day_mortality'),
                            (7, 'prev_week_mortality'),
                            (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(shift_days)
In [340]:
# Zero-fill the NaNs that shift() left at the start of each country's series.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [341]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lag columns here, so
# the target leaks into the components; PCA is also fitted on unscaled data.
# TODO confirm the intended column slice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[341]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [342]:
# Keep only the first six principal components — one per Random Forest input
# variable for the population health index analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
In [343]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the original
# variables; reusing the raw feature names makes the later importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
# Attach the country label by position alongside the components.
principal_df['location'] = df_updated['location'].values
In [344]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream (X is built from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [345]:
# Model inputs: the six principal components; target: the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split of time-series rows can inflate test scores;
# consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [346]:
# Fit scaling on the training set only, so test-set statistics never leak into training
scaler = StandardScaler()
scaler.fit(X_train)
Out[346]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [347]:
# Apply the training-fitted scaling to the training features
X_train_scaled = scaler.transform(X_train)
In [348]:
# Apply the same training-fitted scaling to the test features (no refit)
X_test_scaled = scaler.transform(X_test)
In [349]:
# Instantiate the RandomForestRegressor Model; random_state fixed for reproducibility
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [350]:
# Exhaustive grid search with 10-fold cross-validation (k = 10). n_jobs=-1 runs
# the 81 x 10 fits in parallel across CPU cores; results are unchanged because
# every candidate keeps random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998089086726092
In [351]:
# Refit a Random Forest on the full training split using the tuned hyperparameters.
# Dict unpacking of best_params_ avoids hand-copying each parameter.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [352]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) returns the KL divergence of the
# normalised arrays, not a conventional regression metric. Confirm it is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the mortality rate
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0007961105439895268
R2 Score: 0.9988440348594501
RMSE: 0.028215
Entropy Value: 0.0002548819536094143
In [353]:
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[353]:
feature importance
1 diabetes_prevalence 0.891714
2 female_smokers 0.048162
0 cardiovasc_death_rate 0.047789
3 male_smokers 0.008997
5 median_age 0.002963
4 aged_65_older 0.000374
In [354]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory (pathlib.Path).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[354]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [355]:
country1 = 'Czechia'
country2 = 'Estonia'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [356]:
# Preview the filtered two-country feature frame
df_updated
Out[356]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
4153 Czechia 3/1/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4154 Czechia 3/2/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4155 Czechia 3/3/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4156 Czechia 3/4/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4157 Czechia 3/5/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7306 Estonia 12/26/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7307 Estonia 12/27/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.463645
7308 Estonia 12/28/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423
7309 Estonia 12/29/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423

2095 rows × 9 columns

In [357]:
'''
Add lagged mortality-rate columns (previous day, week, month) per country with
pandas' shift(). This turns the time series into a tabular supervised-learning
problem so the Random Forest — an ensemble method for non-sequential data — can
be used to rank predictors of COVID-19 mortality per country.
'''
# Group once, then derive all three lags from the same grouped series.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for shift_days, lag_col in [(1, 'prev_day_mortality'),
                            (7, 'prev_week_mortality'),
                            (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(shift_days)
In [358]:
# Zero-fill the NaNs that shift() left at the start of each country's series.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [359]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lag columns here, so
# the target leaks into the components; PCA is also fitted on unscaled data.
# TODO confirm the intended column slice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[359]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [360]:
# Keep only the first six principal components — one per Random Forest input
# variable for the country health index analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
In [361]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the original
# variables; reusing the raw feature names makes the later importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Attach the country label by position alongside the components.
principal_df['location'] = df_updated['location'].values
In [362]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream (X is built from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [363]:
# Model inputs: the six principal components; target: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split of time-series rows can inflate test scores;
# consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [364]:
# Fit scaling on the training set only, so test-set statistics never leak into training
scaler = StandardScaler()
scaler.fit(X_train)
Out[364]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [365]:
# Apply the training-fitted scaling to the training features
X_train_scaled = scaler.transform(X_train)
In [366]:
# Apply the same training-fitted scaling to the test features (no refit)
X_test_scaled = scaler.transform(X_test)
In [367]:
# Instantiate the RandomForestRegressor Model; random_state fixed for reproducibility
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [368]:
# Exhaustive grid search with 10-fold cross-validation (k = 10). n_jobs=-1 runs
# the 81 x 10 fits in parallel across CPU cores; results are unchanged because
# every candidate keeps random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9968447535278674
In [369]:
# Refit a Random Forest on the full training split using the tuned hyperparameters.
# Dict unpacking of best_params_ avoids hand-copying each parameter.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [370]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence; it is not a
# regression error metric over raw mortality rates, and it returns inf if
# any y_pred element is 0 where y_test is positive. Confirm this is the
# intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0019107375406635276
R2 Score: 0.9972255787761853
RMSE: 0.043712
Entropy Value: 0.0005664584928279815
In [371]:
# Rank the model inputs by their random-forest importance scores,
# most influential first (last expression renders as the cell output).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[371]:
feature importance
1 human_development_index 0.927005
2 extreme_poverty 0.053546
3 gdp_per_capita 0.010799
5 population 0.005969
0 hospital_beds_per_thousand 0.002133
4 population_density 0.000548
In [372]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[372]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [374]:
country1 = 'United States'
country2 = 'Bulgaria'

# Extracting important features for Random Forest Model Analysis for the population health index
# .copy() makes the filtered result an independent frame, so the lagged
# columns assigned in later cells land on this frame (not a view) and do
# not raise pandas' SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [375]:
# Display the filtered frame (bare last expression -> rich HTML output)
df_updated
Out[375]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 15.413 38.3 1.084791

2100 rows × 9 columns

In [376]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate by 1 day, 1 week, and 30 days, shifting within
# each country so values never leak across the location boundary.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [377]:
# The first day/week/month of each country has no earlier observation to
# lag from, leaving NaN; treat those leading gaps as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [378]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows (before the train/test split) and on
# every column from position 2 onward — which at this point includes
# 'Mortality Rate' itself plus the three lagged-mortality columns. That
# leaks both test-set information and the target into the features;
# consider fitting PCA on the training rows of the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[378]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [379]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Keep only the first 6 components of the projected data
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [380]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original variables — labelling them with raw feature names means the
# later feature-importance table describes components, not features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [381]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never included in X below, so this
# encoding currently has no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [382]:
# (these names actually select the mislabelled principal components — see note above)
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target comes from df_updated; rows align because principal_df preserved order
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [383]:
# Fit scaling on the training set
# NOTE(review): standardization statistics come from the training split only,
# so no test-set information leaks into the scaler here — correct.
scaler = StandardScaler()
scaler.fit(X_train)
Out[383]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [384]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [385]:
# Apply scaling on the test set (re-using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
In [386]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is only a placeholder; GridSearchCV below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 that is 810 forest fits per search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [387]:
# perform grid search and 10-fold cross-validation (k = 10)
# scoring defaults to the estimator's .score(), i.e. R^2 for a regressor
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.949358596400903
In [388]:
# Use the tuned model produced by the grid search.
# GridSearchCV (refit=True, the default) has already refit the best
# hyperparameter combination on the full training set, so
# grid_search.best_estimator_ is the same model that re-instantiating a
# RandomForestRegressor with best_params_ (same random_state=42) and
# fitting it again would produce — reusing it avoids a redundant,
# expensive second fit.
best_rf_model = grid_search.best_estimator_

# Predict mortality rate on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [389]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence; it is not a
# regression error metric over raw mortality rates, and it returns inf if
# any y_pred element is 0 where y_test is positive. Confirm this is the
# intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.04053191186316277
R2 Score: 0.9796126270982285
RMSE: 0.201325
Entropy Value: 0.0013052064555693329
In [390]:
# Rank the model inputs by their random-forest importance scores,
# most influential first (last expression renders as the cell output).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[390]:
feature importance
0 cardiovasc_death_rate 0.746023
1 diabetes_prevalence 0.151873
2 female_smokers 0.040737
3 male_smokers 0.022668
5 median_age 0.020063
4 aged_65_older 0.018636
In [391]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[391]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [392]:
country1 = 'United States'
country2 = 'Bulgaria'

# Extracting important features for the Random Forest Model Analysis for the country health index
# .copy() makes the filtered result an independent frame, so the lagged
# columns assigned in later cells land on this frame (not a view) and do
# not raise pandas' SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [393]:
# Display the filtered frame (bare last expression -> rich HTML output)
df_updated
Out[393]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.770 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.770 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.770 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.770 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.770 0.926 1.2 54225.446 35.608 338289856 1.084791

2100 rows × 9 columns

In [394]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate by 1 day, 1 week, and 30 days, shifting within
# each country so values never leak across the location boundary.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [395]:
# The first day/week/month of each country has no earlier observation to
# lag from, leaving NaN; treat those leading gaps as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [396]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows (before the train/test split) and on
# every column from position 2 onward — which at this point includes
# 'Mortality Rate' itself plus the three lagged-mortality columns. That
# leaks both test-set information and the target into the features;
# consider fitting PCA on the training rows of the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[396]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [397]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Keep only the first 6 components of the projected data
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [398]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original variables — labelling them with raw feature names means the
# later feature-importance table describes components, not features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [399]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never included in X below, so this
# encoding currently has no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [400]:
# (these names actually select the mislabelled principal components — see note above)
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target comes from df_updated; rows align because principal_df preserved order
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [401]:
# Fit scaling on the training set
# NOTE(review): standardization statistics come from the training split only,
# so no test-set information leaks into the scaler here — correct.
scaler = StandardScaler()
scaler.fit(X_train)
Out[401]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [402]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [403]:
# Apply scaling on the test set (re-using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
In [404]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is only a placeholder; GridSearchCV below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 that is 810 forest fits per search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [405]:
# perform grid search and 10-fold cross-validation (k = 10)
# scoring defaults to the estimator's .score(), i.e. R^2 for a regressor
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9463618954147719
In [406]:
# Use the tuned model produced by the grid search.
# GridSearchCV (refit=True, the default) has already refit the best
# hyperparameter combination on the full training set, so
# grid_search.best_estimator_ is the same model that re-instantiating a
# RandomForestRegressor with best_params_ (same random_state=42) and
# fitting it again would produce — reusing it avoids a redundant,
# expensive second fit.
best_rf_model = grid_search.best_estimator_

# Predict mortality rate on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [407]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence; it is not a
# regression error metric over raw mortality rates, and it returns inf if
# any y_pred element is 0 where y_test is positive. Confirm this is the
# intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.021173310142172767
R2 Score: 0.9893499183830592
RMSE: 0.145511
Entropy Value: 0.0007566541098966247
In [408]:
# Rank the model inputs by their random-forest importance scores,
# most influential first (last expression renders as the cell output).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[408]:
feature importance
1 human_development_index 0.651323
5 population 0.119828
0 hospital_beds_per_thousand 0.104203
2 extreme_poverty 0.062823
3 gdp_per_capita 0.031452
4 population_density 0.030370
In [409]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[409]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [410]:
country1 = 'Latvia'
country2 = 'Romania'

# Extracting important features for Random Forest Model Analysis for the population health index
# .copy() makes the filtered result an independent frame, so the lagged
# columns assigned in later cells land on this frame (not a view) and do
# not raise pandas' SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [411]:
# Display the filtered frame (bare last expression -> rich HTML output)
df_updated
Out[411]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
17800 Romania 2/26/2020 370.946 9.74 22.9 37.1 17.850 43.0 0.000000
17801 Romania 2/27/2020 370.946 9.74 22.9 37.1 17.850 43.0 0.000000
17802 Romania 2/28/2020 370.946 9.74 22.9 37.1 17.850 43.0 0.000000
17803 Romania 2/29/2020 370.946 9.74 22.9 37.1 17.850 43.0 0.000000
17804 Romania 3/1/2020 370.946 9.74 22.9 37.1 17.850 43.0 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 19.754 43.9 0.631969

2076 rows × 9 columns

In [412]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate by 1 day, 1 week, and 30 days, shifting within
# each country so values never leak across the location boundary.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [413]:
# The first day/week/month of each country has no earlier observation to
# lag from, leaving NaN; treat those leading gaps as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [414]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows (before the train/test split) and on
# every column from position 2 onward — which at this point includes
# 'Mortality Rate' itself plus the three lagged-mortality columns. That
# leaks both test-set information and the target into the features;
# consider fitting PCA on the training rows of the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[414]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [415]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Keep only the first 6 components of the projected data
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [416]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original variables — labelling them with raw feature names means the
# later feature-importance table describes components, not features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [417]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never included in X below, so this
# encoding currently has no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [418]:
# (these names actually select the mislabelled principal components — see note above)
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target comes from df_updated; rows align because principal_df preserved order
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [419]:
# Fit scaling on the training set
# NOTE(review): standardization statistics come from the training split only,
# so no test-set information leaks into the scaler here — correct.
scaler = StandardScaler()
scaler.fit(X_train)
Out[419]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [420]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [421]:
# Apply scaling on the test set (re-using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
In [422]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is only a placeholder; GridSearchCV below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 that is 810 forest fits per search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [423]:
# perform grid search and 10-fold cross-validation (k = 10)
# scoring defaults to the estimator's .score(), i.e. R^2 for a regressor
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.997239476634585
In [424]:
# Use the tuned model produced by the grid search.
# GridSearchCV (refit=True, the default) has already refit the best
# hyperparameter combination on the full training set, so
# grid_search.best_estimator_ is the same model that re-instantiating a
# RandomForestRegressor with best_params_ (same random_state=42) and
# fitting it again would produce — reusing it avoids a redundant,
# expensive second fit.
best_rf_model = grid_search.best_estimator_

# Predict mortality rate on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [425]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence; it is not a
# regression error metric over raw mortality rates, and it returns inf if
# any y_pred element is 0 where y_test is positive. Confirm this is the
# intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004065478119768843
R2 Score: 0.9972309144609833
RMSE: 0.063761
Entropy Value: 0.0002428106325496918
In [426]:
# Rank the model inputs by their random-forest importance scores,
# most influential first (last expression renders as the cell output).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[426]:
feature importance
0 cardiovasc_death_rate 0.852131
5 median_age 0.077616
1 diabetes_prevalence 0.048967
2 female_smokers 0.014654
3 male_smokers 0.006080
4 aged_65_older 0.000552
In [427]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[427]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [428]:
country1 = 'Latvia'
country2 = 'Romania'

# Extracting important features for the Random Forest Model Analysis for the country health index
# .copy() makes the filtered result an independent frame, so the lagged
# columns assigned in later cells land on this frame (not a view) and do
# not raise pandas' SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [429]:
# Display the filtered frame (bare last expression -> rich HTML output)
df_updated
Out[429]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
17800 Romania 2/26/2020 6.892 0.828 5.7 23313.199 85.129 19659270 0.000000
17801 Romania 2/27/2020 6.892 0.828 5.7 23313.199 85.129 19659270 0.000000
17802 Romania 2/28/2020 6.892 0.828 5.7 23313.199 85.129 19659270 0.000000
17803 Romania 2/29/2020 6.892 0.828 5.7 23313.199 85.129 19659270 0.000000
17804 Romania 3/1/2020 6.892 0.828 5.7 23313.199 85.129 19659270 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631969

2076 rows × 9 columns

In [430]:
# Rationale: a Random Forest is a non-sequential ensemble model, so the
# time-series target must be recast as a supervised-learning problem. Lagged
# copies of the mortality rate (previous day / week / month), created with
# pandas shift(), give each row tabular history features, letting the model
# assess which variables best predict COVID-19 mortality per country.
# shift() is computed per country so one country's history never leaks into another's.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [431]:
# The first 1/7/30 observations of each country have no history, so their lag
# columns are NaN; treat that pre-observation period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [432]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the components are partly built from the prediction
# target (target leakage) — confirm whether the target should be excluded here.
# NOTE(review): PCA is fit on unscaled data; large-magnitude features (e.g.
# population) will dominate the components. Scaling happens only after PCA in
# later cells — consider standardising before fitting PCA instead.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[432]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [433]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# NOTE(review): only the first 6 components are kept; the transform input again
# includes the target and its lags (see the PCA-fit cell above).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [434]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original features — reusing the raw feature names is misleading, and the
# downstream "feature importances" will actually describe components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [435]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is taken from
# principal_df); this step only replaces the 'location' column in df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [436]:
# Feature matrix comes from the PCA frame; target from the original frame.
# Alignment between the two relies on identical row order (positional match).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of a daily time series puts adjacent,
# highly autocorrelated days into train and test — scores will be optimistic.
# Consider a chronological split or sklearn's TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [437]:
# Fit the standardiser on the training split only, so test-set statistics
# never leak into the scaling parameters.
scaler = StandardScaler().fit(X_train)
Out[437]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [438]:
# Apply scaling on the training set
# (transform only — the scaler was fitted on X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [439]:
# Apply scaling on the test set, reusing the training-set statistics.
X_test_scaled = scaler.transform(X_test)
In [440]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder — GridSearchCV below overrides every
# hyper-parameter listed in param_grid; only random_state=42 persists)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [441]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): CV folds are drawn from an already-shuffled time series, so
# the near-1.0 CV R^2 is likely optimistic (see split cell above).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9961638218621977
In [442]:
# Reuse the estimator that GridSearchCV already refit on the full training set
# with the best hyper-parameters (refit=True is GridSearchCV's default), instead
# of re-instantiating and re-fitting an identical RandomForestRegressor — same
# params and random_state=42, so predictions are unchanged.
best_rf_model = grid_search.best_estimator_

# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [443]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# of the two vectors after renormalising each as a probability distribution —
# it is not a standard regression error metric, and zeros in either vector can
# make it ill-defined. Confirm this is the intended measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007175888410943785
R2 Score: 0.9951123463851107
RMSE: 0.084711
Entropy Value: 0.0005117984164720532
In [444]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components that merely reuse
# the raw feature names (see the principal_df cell), so these importances belong
# to PC1..PC6, not to the named original variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[444]:
feature importance
1 human_development_index 0.697452
5 population 0.195946
0 hospital_beds_per_thousand 0.073939
2 extreme_poverty 0.019689
3 gdp_per_capita 0.012548
4 population_density 0.000427
In [445]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[445]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [446]:
country1 = 'Serbia'
country2 = 'Slovakia'

# Extracting important features for Random Forest Model Analysis for the population health index.
# .copy() detaches the subset from the parent frame so later lag-column
# assignments do not raise SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [447]:
# Display the filtered two-country subset for a quick sanity check (rich repr).
df_updated
Out[447]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 37.7 40.2 17.366 41.2 0.717058
16755 Serbia 12/26/2022 439.415 10.08 37.7 40.2 17.366 41.2 0.716963
16756 Serbia 12/27/2022 439.415 10.08 37.7 40.2 17.366 41.2 0.716677
16757 Serbia 12/28/2022 439.415 10.08 37.7 40.2 17.366 41.2 0.716395
16758 Serbia 12/29/2022 439.415 10.08 37.7 40.2 17.366 41.2 0.716205

2067 rows × 9 columns

In [448]:
# Rationale: a Random Forest is a non-sequential ensemble model, so the
# time-series target must be recast as a supervised-learning problem. Lagged
# copies of the mortality rate (previous day / week / month), created with
# pandas shift(), give each row tabular history features, letting the model
# assess which variables best predict COVID-19 mortality per country.
# shift() is computed per country so one country's history never leaks into another's.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [449]:
# The first 1/7/30 observations of each country have no history, so their lag
# columns are NaN; treat that pre-observation period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [450]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged mortality
# columns — the components are partly built from the target (target leakage).
# NOTE(review): PCA is fit on unscaled data; scaling occurs only after PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[450]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [451]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# NOTE(review): the transform input again includes the target and its lags.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [452]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original features — reusing raw feature names misattributes the later
# feature importances.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [453]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [454]:
# Feature matrix from the PCA frame; target from the original frame
# (alignment relies on identical row order).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffle of a daily time series — adjacent, autocorrelated
# days land in both splits, so scores will be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [455]:
# Fit the standardiser on the training split only, so test-set statistics
# never leak into the scaling parameters.
scaler = StandardScaler().fit(X_train)
Out[455]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [456]:
# Apply scaling on the training set (scaler fitted on X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [457]:
# Apply scaling on the test set, reusing the training-set statistics.
X_test_scaled = scaler.transform(X_test)
In [458]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — GridSearchCV overrides every grid
# parameter; only random_state=42 persists)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [459]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): CV folds drawn from a shuffled time series — the near-1.0 CV
# R^2 is likely optimistic. Consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9926858700401283
In [460]:
# Reuse the estimator that GridSearchCV already refit on the full training set
# with the best hyper-parameters (refit=True is GridSearchCV's default) —
# same params and random_state=42, so predictions are unchanged, and the
# duplicate re-fit is avoided.
best_rf_model = grid_search.best_estimator_

# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [461]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence between
# renormalised vectors, not a regression error metric — confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0007922951938540062
R2 Score: 0.9967575480800499
RMSE: 0.028148
Entropy Value: 0.00046542512044971915
In [462]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components relabelled with
# raw feature names, so these importances belong to PC1..PC6, not the named
# original variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[462]:
feature importance
1 diabetes_prevalence 0.871068
5 median_age 0.063050
0 cardiovasc_death_rate 0.030402
2 female_smokers 0.030107
3 male_smokers 0.003771
4 aged_65_older 0.001603
In [463]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[463]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [464]:
country1 = 'Serbia'
country2 = 'Slovakia'

# Extracting important features for the Random Forest Model Analysis for the country health index.
# .copy() detaches the subset from the parent frame so later lag-column
# assignments do not raise SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [465]:
# Display the filtered two-country subset for a quick sanity check (rich repr).
df_updated
Out[465]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.717058
16755 Serbia 12/26/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716963
16756 Serbia 12/27/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716677
16757 Serbia 12/28/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716395
16758 Serbia 12/29/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716205

2067 rows × 9 columns

In [466]:
# Rationale: a Random Forest is a non-sequential ensemble model, so the
# time-series target must be recast as a supervised-learning problem. Lagged
# copies of the mortality rate (previous day / week / month), created with
# pandas shift(), give each row tabular history features, letting the model
# assess which variables best predict COVID-19 mortality per country.
# shift() is computed per country so one country's history never leaks into another's.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [467]:
# The first 1/7/30 observations of each country have no history, so their lag
# columns are NaN; treat that pre-observation period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [468]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns —
# components are partly built from the target (target leakage). PCA is also
# fit on unscaled data; scaling occurs only after PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[468]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [469]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# NOTE(review): the transform input again includes the target and its lags.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [470]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components PC1..PC6, not the original
# features — reusing raw feature names misattributes later importances.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [471]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [472]:
# Feature matrix from the PCA frame; target from the original frame
# (alignment relies on identical row order).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffle of a daily time series — scores will be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [473]:
# Fit the standardiser on the training split only, so test-set statistics
# never leak into the scaling parameters.
scaler = StandardScaler().fit(X_train)
Out[473]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [474]:
# Apply scaling on the training set (scaler fitted on X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [475]:
# Apply scaling on the test set, reusing the training-set statistics.
X_test_scaled = scaler.transform(X_test)
In [476]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — GridSearchCV overrides every grid
# parameter; only random_state=42 persists)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [477]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): CV folds drawn from a shuffled time series — the near-1.0 CV
# R^2 is likely optimistic. Consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9928368516844188
In [478]:
# Reuse the estimator that GridSearchCV already refit on the full training set
# with the best hyper-parameters (refit=True is GridSearchCV's default) —
# same params and random_state=42, so predictions are unchanged, and the
# duplicate re-fit is avoided.
best_rf_model = grid_search.best_estimator_

# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [479]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence between
# renormalised vectors, not a regression error metric — confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0006865654195289836
R2 Score: 0.9971902450248445
RMSE: 0.026202
Entropy Value: 0.0004341579931648298
In [480]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components relabelled with
# raw feature names, so these importances belong to PC1..PC6, not the named
# original variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[480]:
feature importance
1 human_development_index 0.877517
5 population 0.080276
2 extreme_poverty 0.031016
0 hospital_beds_per_thousand 0.005715
3 gdp_per_capita 0.003793
4 population_density 0.001682
In [2]:
# Country Pair by Pair Analysis relative to aged_65_older
In [3]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hardcoded absolute Windows path — prefer a configurable path.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[3]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [4]:
# Showing the pairings of countries based on aged_65_older (13 pairs of countries).
# Build one filtered frame per country via a single comprehension, then bind
# each to the variable name the later concatenation cell expects.
_pair_countries = [
    "Bulgaria", "Finland", "Italy", "Portugal", "Sweden", "Austria",
    "Belgium", "Canada", "Czechia", "Denmark", "Estonia", "France",
    "Latvia", "Netherlands", "Romania", "Serbia", "Slovenia", "Spain",
    "Switzerland", "United Kingdom", "Cyprus", "Iceland", "Ireland",
    "Luxembourg", "Slovakia", "United States",
]
_frames = {c: df.loc[df["location"] == c] for c in _pair_countries}

df_Bulgaria = _frames["Bulgaria"]
df_Finland = _frames["Finland"]
df_Italy = _frames["Italy"]
df_Portugal = _frames["Portugal"]
df_Sweden = _frames["Sweden"]
df_Austria = _frames["Austria"]
df_Belgium = _frames["Belgium"]
df_Canada = _frames["Canada"]
df_Czechia = _frames["Czechia"]
df_Denmark = _frames["Denmark"]
df_Estonia = _frames["Estonia"]
df_France = _frames["France"]
df_Latvia = _frames["Latvia"]
df_Netherlands = _frames["Netherlands"]
df_Romania = _frames["Romania"]
df_Serbia = _frames["Serbia"]
df_Slovenia = _frames["Slovenia"]
df_Spain = _frames["Spain"]
df_Switzerland = _frames["Switzerland"]
df_UnitedKingdom = _frames["United Kingdom"]
df_Cyprus = _frames["Cyprus"]
df_Iceland = _frames["Iceland"]
df_Ireland = _frames["Ireland"]
df_Luxembourg = _frames["Luxembourg"]
df_Slovakia = _frames["Slovakia"]
df_UnitedStates = _frames["United States"]
In [5]:
# tail(-2) keeps all rows except the first two of the United Kingdom frame —
# presumably to align its start date with the other countries; TODO confirm.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [6]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file without the integer index: the cells
# that re-read this CSV expect only the 17 data columns, and to_csv's default
# would add a spurious 'Unnamed: 0' column on re-import.
dataframe_one.to_csv("dataframe-one.csv", index=False)
In [7]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[7]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [8]:
country1 = 'Bulgaria'
country2 = 'Finland'

# Extracting important features for Random Forest Model Analysis for the population health index.
# .copy() detaches the subset from the parent frame so later lag-column
# assignments do not raise SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [9]:
# Display the filtered two-country subset for a quick sanity check (rich repr).
df_updated
Out[9]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 5.76 18.3 22.6 81.91 42.8 0.551590
8372 Finland 12/26/2022 153.507 5.76 18.3 22.6 81.91 42.8 0.551590
8373 Finland 12/27/2022 153.507 5.76 18.3 22.6 81.91 42.8 0.551590
8374 Finland 12/28/2022 153.507 5.76 18.3 22.6 81.91 42.8 0.551590
8375 Finland 12/29/2022 153.507 5.76 18.3 22.6 81.91 42.8 0.551590

2093 rows × 9 columns

In [10]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [11]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [12]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] takes every column after 'location' and 'date', which at
# this point includes 'Mortality Rate' and the three lagged-mortality columns added
# above -- the target leaks into the components. PCA is also fit on unscaled data
# and on the full dataset before the train/test split; confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[12]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [13]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project the same columns used for fitting and keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [14]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear combination of
# all inputs), not the original variable it is named after -- downstream "feature
# importances" therefore describe components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
# Row order is preserved by transform(), so locations can be re-attached positionally.
principal_df['location'] = df_updated['location'].values
In [15]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below (X is built from
# principal_df); the visible effect of this cell is removing the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [16]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of a time series mixes past and future
# dates, which can make test scores optimistic -- consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [17]:
# Fit scaling on the training set
# NOTE(review): scaling is applied to the PCA outputs; the conventional order is to
# standardise the inputs *before* PCA so no single large-scale variable dominates
# the components -- confirm the intended pipeline order.
scaler = StandardScaler()
scaler.fit(X_train)
Out[17]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [18]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [19]:
# Apply scaling on the test set (using statistics fitted on the training set only,
# which correctly avoids test-set leakage at this step)
X_test_scaled = scaler.transform(X_test)
In [20]:
# Instantiate the RandomForestRegressor Model
# (the n_estimators=100 here is only a placeholder -- every value in param_grid,
# including n_estimators, is overridden during the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [21]:
# perform grid search and 10-fold cross-validation (k = 10)
# GridSearchCV scores each candidate with the regressor's default R^2 score and,
# with the default refit=True, refits the best candidate on the whole training set.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9591779911867772
In [22]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [23]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009184792684689342
R2 Score: 0.996290335239657
RMSE: 0.095837
Entropy Value: 0.001116882714947321
In [24]:
# Rank the model inputs by their random-forest importance, highest first.
# (These "features" are the PCA components fed to the model, labeled with the
# original variable names by the earlier principal_df cell.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[24]:
feature importance
0 cardiovasc_death_rate 0.836592
5 median_age 0.056852
1 diabetes_prevalence 0.052026
2 female_smokers 0.030780
3 male_smokers 0.018852
4 life_expectancy 0.004898
In [25]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- breaks on any other machine;
# consider a configurable DATA_DIR / pathlib.Path set in a top config cell.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[25]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [26]:
# Pair of countries compared in this Random Forest analysis (country health index).
country1 = 'Bulgaria'
country2 = 'Finland'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered subset an independent frame, so the lagged-variable
# columns added in later cells do not trigger SettingWithCopyWarning on a slice.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [27]:
df_updated
Out[27]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.50 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.50 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.280 0.938 0.04 40585.721 18.136 5540745 0.551590
8372 Finland 12/26/2022 3.280 0.938 0.04 40585.721 18.136 5540745 0.551590
8373 Finland 12/27/2022 3.280 0.938 0.04 40585.721 18.136 5540745 0.551590
8374 Finland 12/28/2022 3.280 0.938 0.04 40585.721 18.136 5540745 0.551590
8375 Finland 12/29/2022 3.280 0.938 0.04 40585.721 18.136 5540745 0.551590

2093 rows × 9 columns

In [28]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [29]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [30]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] takes every column after 'location' and 'date', which at
# this point includes 'Mortality Rate' and the three lagged-mortality columns added
# above -- the target leaks into the components. PCA is also fit on unscaled data
# and on the full dataset before the train/test split; confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[30]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [31]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project the same columns used for fitting and keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [32]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear combination of
# all inputs), not the original variable it is named after -- downstream "feature
# importances" therefore describe components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Row order is preserved by transform(), so locations can be re-attached positionally.
principal_df['location'] = df_updated['location'].values
In [33]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below (X is built from
# principal_df); the visible effect of this cell is removing the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [34]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of a time series mixes past and future
# dates, which can make test scores optimistic -- consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [35]:
# Fit scaling on the training set
# NOTE(review): scaling is applied to the PCA outputs; the conventional order is to
# standardise the inputs *before* PCA so no single large-scale variable dominates
# the components -- confirm the intended pipeline order.
scaler = StandardScaler()
scaler.fit(X_train)
Out[35]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [36]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [37]:
# Apply scaling on the test set (using statistics fitted on the training set only,
# which correctly avoids test-set leakage at this step)
X_test_scaled = scaler.transform(X_test)
In [38]:
# Instantiate the RandomForestRegressor Model
# (the n_estimators=100 here is only a placeholder -- every value in param_grid,
# including n_estimators, is overridden during the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [39]:
# perform grid search and 10-fold cross-validation (k = 10)
# GridSearchCV scores each candidate with the regressor's default R^2 score and,
# with the default refit=True, refits the best candidate on the whole training set.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9598292232562027
In [40]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [41]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007393669469321995
R2 Score: 0.9970137556696637
RMSE: 0.085986
Entropy Value: 0.0009964875018581088
In [42]:
# Rank the model inputs by their random-forest importance, highest first.
# (These "features" are the PCA components fed to the model, labeled with the
# original variable names by the earlier principal_df cell.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[42]:
feature importance
5 population 0.564647
1 human_development_index 0.222710
0 hospital_beds_per_thousand 0.148938
2 extreme_poverty 0.034093
3 gdp_per_capita 0.023922
4 population_density 0.005690
In [43]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- breaks on any other machine;
# consider a configurable DATA_DIR / pathlib.Path set in a top config cell.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[43]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [44]:
# Pair of countries compared in this Random Forest analysis (population health index).
country1 = 'Italy'
country2 = 'Portugal'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered subset an independent frame, so the lagged-variable
# columns added in later cells do not trigger SettingWithCopyWarning on a slice.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [45]:
df_updated
Out[45]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 46.2 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 47.9 0.735109

2098 rows × 9 columns

In [46]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [47]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [48]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] takes every column after 'location' and 'date', which at
# this point includes 'Mortality Rate' and the three lagged-mortality columns added
# above -- the target leaks into the components. PCA is also fit on unscaled data
# and on the full dataset before the train/test split; confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[48]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [49]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project the same columns used for fitting and keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [50]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear combination of
# all inputs), not the original variable it is named after -- downstream "feature
# importances" therefore describe components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
# Row order is preserved by transform(), so locations can be re-attached positionally.
principal_df['location'] = df_updated['location'].values
In [51]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below (X is built from
# principal_df); the visible effect of this cell is removing the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [52]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of a time series mixes past and future
# dates, which can make test scores optimistic -- consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [53]:
# Fit scaling on the training set
# NOTE(review): scaling is applied to the PCA outputs; the conventional order is to
# standardise the inputs *before* PCA so no single large-scale variable dominates
# the components -- confirm the intended pipeline order.
scaler = StandardScaler()
scaler.fit(X_train)
Out[53]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [54]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [55]:
# Apply scaling on the test set (using statistics fitted on the training set only,
# which correctly avoids test-set leakage at this step)
X_test_scaled = scaler.transform(X_test)
In [56]:
# Instantiate the RandomForestRegressor Model
# (the n_estimators=100 here is only a placeholder -- every value in param_grid,
# including n_estimators, is overridden during the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [57]:
# perform grid search and 10-fold cross-validation (k = 10)
# GridSearchCV scores each candidate with the regressor's default R^2 score and,
# with the default refit=True, refits the best candidate on the whole training set.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9991914852672478
In [58]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [59]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.04174502844368979
R2 Score: 0.996335689684427
RMSE: 0.204316
Entropy Value: 0.0018627830616119736
In [60]:
# Rank the model inputs by their random-forest importance, highest first.
# (These "features" are the PCA components fed to the model, labeled with the
# original variable names by the earlier principal_df cell.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[60]:
feature importance
0 cardiovasc_death_rate 0.917329
1 diabetes_prevalence 0.052036
2 female_smokers 0.025708
5 median_age 0.002549
3 male_smokers 0.002091
4 life_expectancy 0.000287
In [61]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- breaks on any other machine;
# consider a configurable DATA_DIR / pathlib.Path set in a top config cell.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[61]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [62]:
# Pair of countries compared in this Random Forest analysis (country health index).
country1 = 'Italy'
country2 = 'Portugal'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered subset an independent frame, so the lagged-variable
# columns added in later cells do not trigger SettingWithCopyWarning on a slice.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [63]:
df_updated
Out[63]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2098 rows × 9 columns

In [64]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [65]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [66]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] takes every column after 'location' and 'date', which at
# this point includes 'Mortality Rate' and the three lagged-mortality columns added
# above -- the target leaks into the components. PCA is also fit on unscaled data
# and on the full dataset before the train/test split; confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[66]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [67]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project the same columns used for fitting and keep only the first 6 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [68]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear combination of
# all inputs), not the original variable it is named after -- downstream "feature
# importances" therefore describe components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Row order is preserved by transform(), so locations can be re-attached positionally.
principal_df['location'] = df_updated['location'].values
In [69]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below (X is built from
# principal_df); the visible effect of this cell is removing the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [70]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of a time series mixes past and future
# dates, which can make test scores optimistic -- consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [71]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[71]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [72]:
# Apply scaling on the training set (z-score using the training-set mean/std
# learned by the fitted scaler above)
X_train_scaled = scaler.transform(X_train)
In [73]:
# Apply scaling on the test set using the training-set statistics, which
# avoids leaking test-set information into the scaling step
X_test_scaled = scaler.transform(X_test)
In [74]:
# Instantiate the RandomForestRegressor Model
# (the grid search below re-configures this baseline; random_state fixes the
# bootstrap sampling for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],  # number of trees in the forest
    'max_depth': [5, 10, 15],  # maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # min samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # min samples required at a leaf node
}
In [75]:
# perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 runs the
# folds on all CPU cores, which speeds up the 81-combination search without
# changing the (deterministic) results
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9991962488532368
In [76]:
# fit random forest model with best hyperparameters from above; dict unpacking
# of best_params_ avoids repeating a lookup per parameter and stays correct if
# the grid gains parameters later
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [77]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and computes the KL divergence D(y_test || y_pred); it is not a
# standard regression error metric and is undefined (inf) wherever y_pred is 0
# but y_test is not. Interpret with care — TODO confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0346698323414571
R2 Score: 0.996956738825574
RMSE: 0.186198
Entropy Value: 0.0012806717112898633
In [78]:
# Rank the model inputs by impurity-based importance. Note: the inputs are
# principal components that carry original-feature names, so the ranking
# describes the components, not the raw features themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[78]:
feature importance
1 human_development_index 0.923655
5 population 0.047005
2 extreme_poverty 0.026004
3 gdp_per_capita 0.003020
4 population_density 0.000296
0 hospital_beds_per_thousand 0.000020
In [79]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[79]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [80]:
country1 = 'Sweden'
country2 = 'Austria'

# Restrict the frame to the population-health features (plus identifiers and
# the target) for the two countries being compared.
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
In [81]:
df_updated
Out[81]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 41.0 0.816005

2102 rows × 9 columns

In [82]:
# A Random Forest is not sequence-aware, so the Our World in Data time series
# is converted into a supervised-learning table: pandas shift() within each
# country adds the mortality rate observed 1 day, 7 days, and 30 days earlier
# as explicit feature columns. This lets the model be applied directly to
# assess which variables best predict COVID-19 mortality per country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [83]:
# The earliest day/week/month of each country's series has no lagged value;
# treat those leading NaNs as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [84]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and its lagged
# copies at this point, so the target leaks into the components that are later
# used as model inputs — this likely inflates the reported R^2. PCA is also fit
# on unscaled data, so the largest-scale columns dominate the components.
# Consider excluding the target columns and standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[84]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [85]:
# Keep the six leading principal components — one per input variable used in
# the Random Forest analysis for the population health index below.
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
In [86]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original features; reusing the raw feature names makes the later
# feature-importance table easy to misread. Consider 'PC1'..'PC6' instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [87]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X is built from
# principal_df and y from 'Mortality Rate' — so this step could be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [88]:
# Model inputs: the six retained principal components (stored under the
# original feature names) and the mortality-rate target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# (fixed random_state for reproducibility).
# NOTE(review): a random split shuffles time-ordered rows, so test rows are
# interleaved in time with training rows — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [89]:
# Learn standardization statistics (mean/std) from the training split only,
# so no information from the test split enters the scaler.
scaler = StandardScaler().fit(X_train)
Out[89]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [90]:
# Apply scaling on the training set (z-score using the training-set mean/std
# learned by the fitted scaler above)
X_train_scaled = scaler.transform(X_train)
In [91]:
# Apply scaling on the test set using the training-set statistics, which
# avoids leaking test-set information into the scaling step
X_test_scaled = scaler.transform(X_test)
In [92]:
# Instantiate the RandomForestRegressor Model
# (the grid search below re-configures this baseline; random_state fixes the
# bootstrap sampling for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],  # number of trees in the forest
    'max_depth': [5, 10, 15],  # maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # min samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # min samples required at a leaf node
}
In [93]:
# perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 runs the
# folds on all CPU cores, which speeds up the 81-combination search without
# changing the (deterministic) results
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9957633295820632
In [94]:
# fit random forest model with best hyperparameters from above; dict unpacking
# of best_params_ avoids repeating a lookup per parameter and stays correct if
# the grid gains parameters later
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [95]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and computes the KL divergence D(y_test || y_pred); it is not a
# standard regression error metric and is undefined (inf) wherever y_pred is 0
# but y_test is not. Interpret with care — TODO confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.02144933089134254
R2 Score: 0.9954134974658286
RMSE: 0.146456
Entropy Value: 0.0008648401202367788
In [96]:
# Rank the model inputs by impurity-based importance. Note: the inputs are
# principal components that carry original-feature names, so the ranking
# describes the components, not the raw features themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[96]:
feature importance
0 cardiovasc_death_rate 0.634096
1 diabetes_prevalence 0.222993
5 median_age 0.121456
2 female_smokers 0.016268
3 male_smokers 0.003877
4 life_expectancy 0.001309
In [97]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[97]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [98]:
country1 = 'Sweden'
country2 = 'Austria'

# Restrict the frame to the country-health features (plus identifiers and the
# target) for the two countries being compared.
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
In [99]:
df_updated
Out[99]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.816005

2102 rows × 9 columns

In [100]:
# A Random Forest is not sequence-aware, so the Our World in Data time series
# is converted into a supervised-learning table: pandas shift() within each
# country adds the mortality rate observed 1 day, 7 days, and 30 days earlier
# as explicit feature columns. This lets the model be applied directly to
# assess which variables best predict COVID-19 mortality per country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [101]:
# The earliest day/week/month of each country's series has no lagged value;
# treat those leading NaNs as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [102]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and its lagged
# copies at this point, so the target leaks into the components that are later
# used as model inputs — this likely inflates the reported R^2. PCA is also fit
# on unscaled data, so the largest-scale columns dominate the components.
# Consider excluding the target columns and standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[102]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [103]:
# Keep the six leading principal components — one per input variable used in
# the Random Forest analysis for the country health index below.
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
In [104]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original features; reusing the raw feature names makes the later
# feature-importance table easy to misread. Consider 'PC1'..'PC6' instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [105]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X is built from
# principal_df and y from 'Mortality Rate' — so this step could be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [106]:
# Model inputs: the six retained principal components (stored under the
# original feature names) and the mortality-rate target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# (fixed random_state for reproducibility).
# NOTE(review): a random split shuffles time-ordered rows, so test rows are
# interleaved in time with training rows — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [107]:
# Learn standardization statistics (mean/std) from the training split only,
# so no information from the test split enters the scaler.
scaler = StandardScaler().fit(X_train)
Out[107]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [108]:
# Apply scaling on the training set (z-score using the training-set mean/std
# learned by the fitted scaler above)
X_train_scaled = scaler.transform(X_train)
In [109]:
# Apply scaling on the test set using the training-set statistics, which
# avoids leaking test-set information into the scaling step
X_test_scaled = scaler.transform(X_test)
In [110]:
# Instantiate the RandomForestRegressor Model
# (the grid search below re-configures this baseline; random_state fixes the
# bootstrap sampling for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],  # number of trees in the forest
    'max_depth': [5, 10, 15],  # maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # min samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # min samples required at a leaf node
}
In [111]:
# perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 runs the
# folds on all CPU cores, which speeds up the 81-combination search without
# changing the (deterministic) results
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9959755632813897
In [112]:
# fit random forest model with best hyperparameters from above; dict unpacking
# of best_params_ avoids repeating a lookup per parameter and stays correct if
# the grid gains parameters later
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [113]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and computes the KL divergence D(y_test || y_pred); it is not a
# standard regression error metric and is undefined (inf) wherever y_pred is 0
# but y_test is not. Interpret with care — TODO confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.02494529961192575
R2 Score: 0.994665955760329
RMSE: 0.157941
Entropy Value: 0.0010168948933280927
In [114]:
# Rank the model inputs by impurity-based importance. Note: the inputs are
# principal components that carry original-feature names, so the ranking
# describes the components, not the raw features themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[114]:
feature importance
1 human_development_index 0.948503
2 extreme_poverty 0.020497
5 population 0.019786
0 hospital_beds_per_thousand 0.006435
3 gdp_per_capita 0.003427
4 population_density 0.001354
In [115]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[115]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [116]:
country1 = 'Belgium'
country2 = 'Canada'

# Restrict the frame to the population-health features (plus identifiers and
# the target) for the two countries being compared.
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
In [117]:
df_updated
Out[117]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 31.4 81.63 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 31.4 81.63 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 31.4 81.63 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 31.4 81.63 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 31.4 81.63 41.8 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 41.4 1.093162

2132 rows × 9 columns

In [118]:
# A Random Forest is not sequence-aware, so the Our World in Data time series
# is converted into a supervised-learning table: pandas shift() within each
# country adds the mortality rate observed 1 day, 7 days, and 30 days earlier
# as explicit feature columns. This lets the model be applied directly to
# assess which variables best predict COVID-19 mortality per country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [119]:
# The earliest day/week/month of each country's series has no lagged value;
# treat those leading NaNs as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [120]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and its lagged
# copies at this point, so the target leaks into the components that are later
# used as model inputs — this likely inflates the reported R^2. PCA is also fit
# on unscaled data, so the largest-scale columns dominate the components.
# Consider excluding the target columns and standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[120]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [121]:
# Keep the six leading principal components — one per input variable used in
# the Random Forest analysis for the population health index below.
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
In [122]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original features; reusing the raw feature names makes the later
# feature-importance table easy to misread. Consider 'PC1'..'PC6' instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [123]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X is built from
# principal_df and y from 'Mortality Rate' — so this step could be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [124]:
# Model inputs: the six retained principal components (stored under the
# original feature names) and the mortality-rate target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# (fixed random_state for reproducibility).
# NOTE(review): a random split shuffles time-ordered rows, so test rows are
# interleaved in time with training rows — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [125]:
# Learn standardization statistics (mean/std) from the training split only,
# so no information from the test split enters the scaler.
scaler = StandardScaler().fit(X_train)
Out[125]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [126]:
# Apply scaling on the training set (z-score using the training-set mean/std
# learned by the fitted scaler above)
X_train_scaled = scaler.transform(X_train)
In [127]:
# Apply scaling on the test set using the training-set statistics, which
# avoids leaking test-set information into the scaling step
X_test_scaled = scaler.transform(X_test)
In [128]:
# Instantiate the RandomForestRegressor Model
# (the grid search below re-configures this baseline; random_state fixes the
# bootstrap sampling for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],  # number of trees in the forest
    'max_depth': [5, 10, 15],  # maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # min samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # min samples required at a leaf node
}
In [129]:
# perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 runs the
# folds on all CPU cores, which speeds up the 81-combination search without
# changing the (deterministic) results
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974429094620534
In [130]:
# fit random forest model with best hyperparameters from above; dict unpacking
# of best_params_ avoids repeating a lookup per parameter and stays correct if
# the grid gains parameters later
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [131]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and computes the KL divergence D(y_test || y_pred); it is not a
# standard regression error metric and is undefined (inf) wherever y_pred is 0
# but y_test is not. Interpret with care — TODO confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.034222729995244096
R2 Score: 0.9974121725749812
RMSE: 0.184994
Entropy Value: 0.0007442340019509717
In [132]:
# Rank the model inputs by impurity-based importance. Note: the inputs are
# principal components that carry original-feature names, so the ranking
# describes the components, not the raw features themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[132]:
feature importance
1 diabetes_prevalence 0.900768
0 cardiovasc_death_rate 0.067474
2 female_smokers 0.024148
5 median_age 0.004890
3 male_smokers 0.002331
4 life_expectancy 0.000390
In [137]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[137]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [138]:
country1 = 'Belgium'
country2 = 'Canada'

# Extracting important features for the Random Forest Model Analysis for the country health index
# .copy() makes an explicit, independent frame: later cells assign new lagged
# columns into df_updated, and writing into a filtered slice/view triggers
# pandas' SettingWithCopyWarning (and can silently no-op under copy-on-write).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [139]:
df_updated
Out[139]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.093162

2132 rows × 9 columns

In [140]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): these lag columns are never listed in selected_cols later, but
# they DO enter the model indirectly — the PCA below is fit on iloc[:, 2:],
# which at that point also contains 'Mortality Rate' itself plus these lags
# (target leakage). Verify the intended feature set.
# NOTE(review): if the upstream filter cell did not .copy(), these assignments
# write into a slice and may raise SettingWithCopyWarning.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [141]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the NaNs come from shift(): the first day/week/month of each country has no
# prior observation). One vectorised assignment covers all three lag columns.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [142]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] at this point still contains
# 'Mortality Rate' (the prediction target) and its three lagged copies, so the
# components are fit on the label itself — target leakage that likely explains
# the near-perfect downstream R^2. Confirm and drop those columns before PCA.
# NOTE(review): PCA is also fit on unscaled data, so large-magnitude columns
# (e.g. population) dominate the variance; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[142]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [143]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# NOTE(review): the transform uses the same column slice the PCA was fit on
# (which still includes the target and its lags); only the first 6 components
# are kept.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [144]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names relabel principal components PC1..PC6 with
# the original feature names — misleading when the feature-importance table is
# interpreted later; consider naming them 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [145]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never selected into X below,
# so this encoding currently has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [146]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# NOTE(review): these are column LABELS of principal_df, whose values are
# principal components — X therefore contains PCs, not the named features.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on daily time-series data places
# adjacent days in train and test; a chronological split (or TimeSeriesSplit)
# would give an honest out-of-sample estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [147]:
# Fit scaling on the training set
# NOTE(review): standardization happens AFTER PCA here; the usual order is to
# standardize first so PCA is not dominated by large-scale columns. Tree-based
# models are insensitive to monotonic per-feature scaling, so this step is
# likely a no-op for the Random Forest itself.
scaler = StandardScaler()
scaler.fit(X_train)
Out[147]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [148]:
# Apply scaling on the training set
# (uses mean/std learned from the training set only)
X_train_scaled = scaler.transform(X_train)
In [149]:
# Apply scaling on the test set
# (reuses training-set statistics, avoiding test-set leakage at this step)
X_test_scaled = scaler.transform(X_test)
In [150]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — the grid search below
# overrides it with the 'n_estimators' values in param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [151]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 evaluates the 81 candidates x 10 folds on all CPU cores; scores are
# unchanged because every fit is seeded by the estimator's random_state.
# NOTE(review): cv=10 uses unshuffled KFold on time-ordered rows — combined
# with the shuffled train/test split this still mixes temporal structure.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975469171045417
In [152]:
# Reuse the estimator that GridSearchCV already refit on the full training set
# (refit=True is the default), rather than rebuilding it by hand from
# grid_search.best_params_ — the model is identical (same hyperparameters,
# same random_state=42) and a redundant training pass is avoided.
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test_scaled)
In [153]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalizing both arrays into probability distributions — mortality rates are
# not distributions, so this value has no standard meaning as a regression
# metric (and zeros in y_pred where y_test > 0 yield inf). Confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.03171734845754228
R2 Score: 0.9976016225415473
RMSE: 0.178094
Entropy Value: 0.0007572895754231443
In [154]:
# Rank the model inputs by Random Forest importance score (descending).
# NOTE(review): the inputs are principal components that were merely *named*
# after the original features, so these scores describe PCs, not the raw
# variables — confirm before interpreting them as feature effects.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame(list(zip(selected_cols, importance_scores)),
                 columns=['feature', 'importance'])
    .sort_values(by='importance', ascending=False)
)
feature_importances
Out[154]:
feature importance
1 human_development_index 0.965414
2 extreme_poverty 0.021985
5 population 0.007566
0 hospital_beds_per_thousand 0.002465
3 gdp_per_capita 0.001925
4 population_density 0.000644
In [155]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a DATA_DIR constant
# in the setup cell so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[155]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [156]:
country1 = 'Czechia'
country2 = 'Denmark'

# Extracting important features for Random Forest Model Analysis for the population health index
# .copy() makes an explicit, independent frame: later cells assign new lagged
# columns into df_updated, and writing into a filtered slice/view triggers
# pandas' SettingWithCopyWarning (and can silently no-op under copy-on-write).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [157]:
df_updated
Out[157]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 79.38 43.3 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 79.38 43.3 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 79.38 43.3 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 79.38 43.3 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 79.38 43.3 0.000000
... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 114.767 6.41 19.3 18.8 80.90 42.3 0.227772
6245 Denmark 12/26/2022 114.767 6.41 19.3 18.8 80.90 42.3 0.227772
6246 Denmark 12/27/2022 114.767 6.41 19.3 18.8 80.90 42.3 0.228905
6247 Denmark 12/28/2022 114.767 6.41 19.3 18.8 80.90 42.3 0.229131
6248 Denmark 12/29/2022 114.767 6.41 19.3 18.8 80.90 42.3 0.229131

2096 rows × 9 columns

In [158]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): these lag columns are never listed in selected_cols later, but
# they DO enter the model indirectly — the PCA below is fit on iloc[:, 2:],
# which at that point also contains 'Mortality Rate' itself plus these lags
# (target leakage). Verify the intended feature set.
# NOTE(review): if the upstream filter cell did not .copy(), these assignments
# write into a slice and may raise SettingWithCopyWarning.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [159]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the NaNs come from shift(): the first day/week/month of each country has no
# prior observation). One vectorised assignment covers all three lag columns.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [160]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] at this point still contains
# 'Mortality Rate' (the prediction target) and its three lagged copies, so the
# components are fit on the label itself — target leakage that likely explains
# the near-perfect downstream R^2. Confirm and drop those columns before PCA.
# NOTE(review): PCA is also fit on unscaled data, so large-magnitude columns
# dominate the variance; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[160]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [161]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# NOTE(review): the transform uses the same column slice the PCA was fit on
# (which still includes the target and its lags); only the first 6 components
# are kept.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [162]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names relabel principal components PC1..PC6 with
# the original feature names — misleading when the feature-importance table is
# interpreted later; consider naming them 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [163]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never selected into X below,
# so this encoding currently has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [164]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
# NOTE(review): these are column LABELS of principal_df, whose values are
# principal components — X therefore contains PCs, not the named features.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on daily time-series data places
# adjacent days in train and test; a chronological split (or TimeSeriesSplit)
# would give an honest out-of-sample estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [165]:
# Fit scaling on the training set
# NOTE(review): standardization happens AFTER PCA here; the usual order is to
# standardize first so PCA is not dominated by large-scale columns. Tree-based
# models are insensitive to monotonic per-feature scaling, so this step is
# likely a no-op for the Random Forest itself.
scaler = StandardScaler()
scaler.fit(X_train)
Out[165]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [166]:
# Apply scaling on the training set
# (uses mean/std learned from the training set only)
X_train_scaled = scaler.transform(X_train)
In [167]:
# Apply scaling on the test set
# (reuses training-set statistics, avoiding test-set leakage at this step)
X_test_scaled = scaler.transform(X_test)
In [168]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — the grid search below
# overrides it with the 'n_estimators' values in param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [169]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 evaluates the 81 candidates x 10 folds on all CPU cores; scores are
# unchanged because every fit is seeded by the estimator's random_state.
# NOTE(review): cv=10 uses unshuffled KFold on time-ordered rows — combined
# with the shuffled train/test split this still mixes temporal structure.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998749456645679
In [170]:
# Reuse the estimator that GridSearchCV already refit on the full training set
# (refit=True is the default), rather than rebuilding it by hand from
# grid_search.best_params_ — the model is identical (same hyperparameters,
# same random_state=42) and a redundant training pass is avoided.
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test_scaled)
In [171]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalizing both arrays into probability distributions — mortality rates are
# not distributions, so this value has no standard meaning as a regression
# metric (and zeros in y_pred where y_test > 0 yield inf). Confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00293085842141847
R2 Score: 0.99756672605
RMSE: 0.054137
Entropy Value: 0.0006772694785891467
In [172]:
# Rank the model inputs by Random Forest importance score (descending).
# NOTE(review): the inputs are principal components that were merely *named*
# after the original features, so these scores describe PCs, not the raw
# variables — confirm before interpreting them as feature effects.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame(list(zip(selected_cols, importance_scores)),
                 columns=['feature', 'importance'])
    .sort_values(by='importance', ascending=False)
)
feature_importances
Out[172]:
feature importance
1 diabetes_prevalence 0.954483
2 female_smokers 0.023955
0 cardiovasc_death_rate 0.016581
5 median_age 0.003026
3 male_smokers 0.001699
4 life_expectancy 0.000256
In [173]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a DATA_DIR constant
# in the setup cell so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[173]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [174]:
country1 = 'Czechia'
country2 = 'Denmark'

# Extracting important features for the Random Forest Model Analysis for the country health index
# .copy() makes an explicit, independent frame: later cells assign new lagged
# columns into df_updated, and writing into a filtered slice/view triggers
# pandas' SettingWithCopyWarning (and can silently no-op under copy-on-write).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [175]:
df_updated
Out[175]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
4153 Czechia 3/1/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4154 Czechia 3/2/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4155 Czechia 3/3/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4156 Czechia 3/4/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4157 Czechia 3/5/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.227772
6245 Denmark 12/26/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.227772
6246 Denmark 12/27/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.228905
6247 Denmark 12/28/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.229131
6248 Denmark 12/29/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.229131

2096 rows × 9 columns

In [176]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): these lag columns are never listed in selected_cols later, but
# they DO enter the model indirectly — the PCA below is fit on iloc[:, 2:],
# which at that point also contains 'Mortality Rate' itself plus these lags
# (target leakage). Verify the intended feature set.
# NOTE(review): if the upstream filter cell did not .copy(), these assignments
# write into a slice and may raise SettingWithCopyWarning.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [177]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the NaNs come from shift(): the first day/week/month of each country has no
# prior observation). One vectorised assignment covers all three lag columns.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [178]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] at this point still contains
# 'Mortality Rate' (the prediction target) and its three lagged copies, so the
# components are fit on the label itself — target leakage that likely explains
# the near-perfect downstream R^2. Confirm and drop those columns before PCA.
# NOTE(review): PCA is also fit on unscaled data, so large-magnitude columns
# (e.g. population) dominate the variance; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[178]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [179]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# NOTE(review): the transform uses the same column slice the PCA was fit on
# (which still includes the target and its lags); only the first 6 components
# are kept.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [180]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names relabel principal components PC1..PC6 with
# the original feature names — misleading when the feature-importance table is
# interpreted later; consider naming them 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [181]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never selected into X below,
# so this encoding currently has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [182]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# NOTE(review): these are column LABELS of principal_df, whose values are
# principal components — X therefore contains PCs, not the named features.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on daily time-series data places
# adjacent days in train and test; a chronological split (or TimeSeriesSplit)
# would give an honest out-of-sample estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [183]:
# Fit scaling on the training set
# NOTE(review): standardization happens AFTER PCA here; the usual order is to
# standardize first so PCA is not dominated by large-scale columns. Tree-based
# models are insensitive to monotonic per-feature scaling, so this step is
# likely a no-op for the Random Forest itself.
scaler = StandardScaler()
scaler.fit(X_train)
Out[183]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [184]:
# Apply scaling on the training set
# (uses mean/std learned from the training set only)
X_train_scaled = scaler.transform(X_train)
In [185]:
# Apply scaling on the test set
# (reuses training-set statistics, avoiding test-set leakage at this step)
X_test_scaled = scaler.transform(X_test)
In [186]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — the grid search below
# overrides it with the 'n_estimators' values in param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [187]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 evaluates the 81 candidates x 10 folds on all CPU cores; scores are
# unchanged because every fit is seeded by the estimator's random_state.
# NOTE(review): cv=10 uses unshuffled KFold on time-ordered rows — combined
# with the shuffled train/test split this still mixes temporal structure.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984672041343956
In [188]:
# Reuse the estimator that GridSearchCV already refit on the full training set
# (refit=True is the default), rather than rebuilding it by hand from
# grid_search.best_params_ — the model is identical (same hyperparameters,
# same random_state=42) and a redundant training pass is avoided.
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test_scaled)
In [189]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalizing both arrays into probability distributions — mortality rates are
# not distributions, so this value has no standard meaning as a regression
# metric (and zeros in y_pred where y_test > 0 yield inf). Confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0024692245202902934
R2 Score: 0.9979499863732701
RMSE: 0.049691
Entropy Value: 0.0005994641511856433
In [190]:
# Rank the model inputs by Random Forest importance score (descending).
# NOTE(review): the inputs are principal components that were merely *named*
# after the original features, so these scores describe PCs, not the raw
# variables — confirm before interpreting them as feature effects.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame(list(zip(selected_cols, importance_scores)),
                 columns=['feature', 'importance'])
    .sort_values(by='importance', ascending=False)
)
feature_importances
Out[190]:
feature importance
1 human_development_index 0.967631
2 extreme_poverty 0.025845
5 population 0.003256
3 gdp_per_capita 0.001799
0 hospital_beds_per_thousand 0.001150
4 population_density 0.000319
In [191]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a DATA_DIR constant
# in the setup cell so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[191]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [192]:
country1 = 'Estonia'
country2 = 'France'

# Extracting important features for Random Forest Model Analysis for the population health index
# .copy() makes an explicit, independent frame: later cells assign new lagged
# columns into df_updated, and writing into a filtered slice/view triggers
# pandas' SettingWithCopyWarning (and can silently no-op under copy-on-write).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [193]:
df_updated
Out[193]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 42.7 0.000000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 42.7 0.000000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 42.7 0.000000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 42.7 0.000000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 42.7 0.000000
... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 35.6 82.66 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 35.6 82.66 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 35.6 82.66 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 35.6 82.66 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 35.6 82.66 42.0 0.411892

2132 rows × 9 columns

In [194]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): these lag columns are never listed in selected_cols later, but
# they DO enter the model indirectly — the PCA below is fit on iloc[:, 2:],
# which at that point also contains 'Mortality Rate' itself plus these lags
# (target leakage). Verify the intended feature set.
# NOTE(review): if the upstream filter cell did not .copy(), these assignments
# write into a slice and may raise SettingWithCopyWarning.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [195]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the NaNs come from shift(): the first day/week/month of each country has no
# prior observation). One vectorised assignment covers all three lag columns.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [196]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] at this point still contains
# 'Mortality Rate' (the prediction target) and its three lagged copies, so the
# components are fit on the label itself — target leakage that likely explains
# the near-perfect downstream R^2. Confirm and drop those columns before PCA.
# NOTE(review): PCA is also fit on unscaled data, so large-magnitude columns
# dominate the variance; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[196]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [197]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# NOTE(review): the transform uses the same column slice the PCA was fit on
# (which still includes the target and its lags); only the first 6 components
# are kept.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [198]:
# Wrap the component scores in a dataframe for the downstream model.
# NOTE(review): these columns are principal components, NOT the original
# features — labelling PC1..PC6 with feature names is misleading and makes the
# later "feature importance" table rank components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
# Attach the country label by positional alignment with df_updated's rows.
principal_df['location'] = df_updated['location'].values
In [199]:
# One-hot encode the country label with get_dummies().
# NOTE(review): the dummy columns are never used afterwards — the model input X
# is taken from principal_df — so this step effectively only removes 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [200]:
# Model inputs: the six principal-component scores (labelled upstream with the
# original population-health feature names); target: the mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Hold out 30% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [201]:
# Learn standardization statistics from the training split only,
# so no test-set information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
Out[201]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [202]:
# Standardize the training features using the training-set mean/std.
X_train_scaled = scaler.transform(X_train)
In [203]:
# Standardize the test features with the statistics learned on the training set.
X_test_scaled = scaler.transform(X_test)
In [204]:
# Base estimator for the hyperparameter search (fixed seed for reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: 3 x 3 x 3 x 3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [205]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9950083298202264
In [206]:
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids repeating every hyperparameter by hand and stays
# correct if the grid is later extended with new parameters.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [207]:
# Evaluate the tuned model on the held-out test set: MSE, RMSE, R^2, "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two *probability distributions* (it normalizes its inputs); applying it to raw
# regression targets/predictions is not a standard error metric and returns inf
# if any prediction is 0 where the target is non-zero — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.09514590931040419
R2 Score: 0.9900939715866824
RMSE: 0.308457
Entropy Value: 0.005407421439201657
In [208]:
# Rank the model inputs by impurity-based importance, highest first.
# (These are importances of principal components, given the upstream PCA.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[208]:
feature importance
1 diabetes_prevalence 0.789072
0 cardiovasc_death_rate 0.159897
5 median_age 0.031527
2 female_smokers 0.015592
3 male_smokers 0.003481
4 life_expectancy 0.000430
In [209]:
# Reload the full per-country dataframe for the next country pairing.
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Rich display of the reloaded frame.
df_updated
Out[209]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [210]:
# Country pairing under analysis.
country1 = 'Estonia'
country2 = 'France'

# Restrict to the country-health-index features and the two selected countries.
country_cols = ['location', 'date', 'hospital_beds_per_thousand',
                'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [211]:
df_updated
Out[211]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6250 Estonia 1/18/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6251 Estonia 2/5/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6252 Estonia 2/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6253 Estonia 2/7/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411710
9443 France 12/26/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411282
9444 France 12/27/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411730
9445 France 12/28/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411813
9446 France 12/29/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411892

2132 rows × 9 columns

In [212]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1-day, 7-day, 30-day), computed within each country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [213]:
# Replace the NaNs created by the lag shifts (the first 1/7/30 observations of
# each country have no earlier value) with 0 in one vectorized assignment
# instead of three repeated per-column statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [214]:
# Fit PCA on the numeric predictors to address multicollinearity.
# NOTE(review): PCA is fitted on the full two-country dataset (before the
# train/test split) and on unscaled features — components leak test-set
# information and are dominated by large-magnitude columns — confirm intent.
pca = PCA().fit(df_updated.iloc[:, 2:])
Out[214]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [215]:
# Keep the first six principal components — one per input variable of the
# random-forest analysis for the country health index.
N_COMPONENTS = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :N_COMPONENTS]
In [216]:
# Wrap the component scores in a dataframe for the downstream model.
# NOTE(review): these columns are principal components, NOT the original
# features — labelling PC1..PC6 with feature names is misleading and makes the
# later "feature importance" table rank components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Attach the country label by positional alignment with df_updated's rows.
principal_df['location'] = df_updated['location'].values
In [217]:
# One-hot encode the country label with get_dummies().
# NOTE(review): the dummy columns are never used afterwards — the model input X
# is taken from principal_df — so this step effectively only removes 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [218]:
# Model inputs: the six principal-component scores (labelled upstream with the
# original country-health feature names); target: the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Hold out 30% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [219]:
# Learn standardization statistics from the training split only,
# so no test-set information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
Out[219]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [220]:
# Standardize the training features using the training-set mean/std.
X_train_scaled = scaler.transform(X_train)
In [221]:
# Standardize the test features with the statistics learned on the training set.
X_test_scaled = scaler.transform(X_test)
In [222]:
# Base estimator for the hyperparameter search (fixed seed for reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: 3 x 3 x 3 x 3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [223]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9924204242316919
In [224]:
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids repeating every hyperparameter by hand and stays
# correct if the grid is later extended with new parameters.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [225]:
# Evaluate the tuned model on the held-out test set: MSE, RMSE, R^2, "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two *probability distributions* (it normalizes its inputs); applying it to raw
# regression targets/predictions is not a standard error metric and returns inf
# if any prediction is 0 where the target is non-zero — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.09564363505394957
R2 Score: 0.9900421513308949
RMSE: 0.309263
Entropy Value: 0.005800126883421082
In [226]:
# Rank the model inputs by impurity-based importance, highest first.
# (These are importances of principal components, given the upstream PCA.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[226]:
feature importance
1 human_development_index 0.929469
5 population 0.033558
2 extreme_poverty 0.029972
3 gdp_per_capita 0.003907
0 hospital_beds_per_thousand 0.002285
4 population_density 0.000808
In [227]:
# Reload the full per-country dataframe for the next country pairing.
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Rich display of the reloaded frame.
df_updated
Out[227]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [228]:
# Country pairing under analysis.
country1 = 'Latvia'
country2 = 'Netherlands'

# Restrict to the population-health-index features and the two selected countries.
country_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
                'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [229]:
df_updated
Out[229]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 43.9 0.631969

2075 rows × 9 columns

In [230]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1-day, 7-day, 30-day), computed within each country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [231]:
# Replace the NaNs created by the lag shifts (the first 1/7/30 observations of
# each country have no earlier value) with 0 in one vectorized assignment
# instead of three repeated per-column statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [232]:
# Fit PCA on the numeric predictors to address multicollinearity.
# NOTE(review): PCA is fitted on the full two-country dataset (before the
# train/test split) and on unscaled features — components leak test-set
# information and are dominated by large-magnitude columns — confirm intent.
pca = PCA().fit(df_updated.iloc[:, 2:])
Out[232]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [233]:
# Keep the first six principal components — one per input variable of the
# random-forest analysis for the population health index.
N_COMPONENTS = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :N_COMPONENTS]
In [234]:
# Wrap the component scores in a dataframe for the downstream model.
# NOTE(review): these columns are principal components, NOT the original
# features — labelling PC1..PC6 with feature names is misleading and makes the
# later "feature importance" table rank components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
# Attach the country label by positional alignment with df_updated's rows.
principal_df['location'] = df_updated['location'].values
In [235]:
# One-hot encode the country label with get_dummies().
# NOTE(review): the dummy columns are never used afterwards — the model input X
# is taken from principal_df — so this step effectively only removes 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [236]:
# Model inputs: the six principal-component scores (labelled upstream with the
# original population-health feature names); target: the mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Hold out 30% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [237]:
# Learn standardization statistics from the training split only,
# so no test-set information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
Out[237]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [238]:
# Standardize the training features using the training-set mean/std.
X_train_scaled = scaler.transform(X_train)
In [239]:
# Standardize the test features with the statistics learned on the training set.
X_test_scaled = scaler.transform(X_test)
In [240]:
# Base estimator for the hyperparameter search (fixed seed for reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: 3 x 3 x 3 x 3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [241]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985833133413833
In [242]:
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids repeating every hyperparameter by hand and stays
# correct if the grid is later extended with new parameters.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [243]:
# Evaluate the tuned model on the held-out test set: MSE, RMSE, R^2, "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two *probability distributions* (it normalizes its inputs); applying it to raw
# regression targets/predictions is not a standard error metric and returns inf
# if any prediction is 0 where the target is non-zero — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008546819092992161
R2 Score: 0.9988517930067208
RMSE: 0.092449
Entropy Value: 0.00036524704725572567
In [244]:
# Rank the model inputs by impurity-based importance, highest first.
# (These are importances of principal components, given the upstream PCA.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[244]:
feature importance
1 diabetes_prevalence 0.679384
0 cardiovasc_death_rate 0.274804
2 female_smokers 0.036255
5 median_age 0.005677
3 male_smokers 0.003251
4 life_expectancy 0.000629
In [245]:
# Reload the full per-country dataframe for the next country pairing.
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Rich display of the reloaded frame.
df_updated
Out[245]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [246]:
# Country pairing under analysis.
country1 = 'Latvia'
country2 = 'Netherlands'

# Restrict to the country-health-index features and the two selected countries.
country_cols = ['location', 'date', 'hospital_beds_per_thousand',
                'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [247]:
df_updated
Out[247]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631969

2075 rows × 9 columns

In [248]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1-day, 7-day, 30-day), computed within each country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [249]:
# Replace the NaNs created by the lag shifts (the first 1/7/30 observations of
# each country have no earlier value) with 0 in one vectorized assignment
# instead of three repeated per-column statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [250]:
# Fit PCA on the numeric predictors to address multicollinearity.
# NOTE(review): PCA is fitted on the full two-country dataset (before the
# train/test split) and on unscaled features — components leak test-set
# information and are dominated by large-magnitude columns — confirm intent.
pca = PCA().fit(df_updated.iloc[:, 2:])
Out[250]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [251]:
# Keep the first six principal components — one per input variable of the
# random-forest analysis for the country health index.
N_COMPONENTS = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :N_COMPONENTS]
In [252]:
# Wrap the component scores in a dataframe for the downstream model.
# NOTE(review): these columns are principal components, NOT the original
# features — labelling PC1..PC6 with feature names is misleading and makes the
# later "feature importance" table rank components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Attach the country label by positional alignment with df_updated's rows.
principal_df['location'] = df_updated['location'].values
In [253]:
# One-hot encode the country label with get_dummies().
# NOTE(review): the dummy columns are never used afterwards — the model input X
# is taken from principal_df — so this step effectively only removes 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [254]:
# Model inputs: the six principal-component scores (labelled upstream with the
# original country-health feature names); target: the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Hold out 30% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [255]:
# Learn standardization statistics from the training split only,
# so no test-set information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
Out[255]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [256]:
# Standardize the training features using the training-set mean/std.
X_train_scaled = scaler.transform(X_train)
In [257]:
# Standardize the test features with the statistics learned on the training set.
X_test_scaled = scaler.transform(X_test)
In [258]:
# Base estimator for the hyperparameter search (fixed seed for reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: 3 x 3 x 3 x 3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [259]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9982718550442117
In [260]:
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids repeating every hyperparameter by hand and stays
# correct if the grid is later extended with new parameters.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [261]:
# Evaluate the tuned model on the held-out test set: MSE, RMSE, R^2, "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two *probability distributions* (it normalizes its inputs); applying it to raw
# regression targets/predictions is not a standard error metric and returns inf
# if any prediction is 0 where the target is non-zero — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.012318889250834996
R2 Score: 0.9983450410458743
RMSE: 0.110990
Entropy Value: 0.0005681485654265903
In [262]:
# Rank the model inputs by impurity-based importance, highest first.
# (These are importances of principal components, given the upstream PCA.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[262]:
feature importance
1 human_development_index 0.946276
2 extreme_poverty 0.036129
5 population 0.011335
3 gdp_per_capita 0.003979
0 hospital_beds_per_thousand 0.001546
4 population_density 0.000735
In [263]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- not portable. Consider a DATA_DIR
# pathlib constant in a config cell so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[263]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [264]:
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame an independent DataFrame, so the lagged-column
# assignments in later cells cannot trigger SettingWithCopyWarning or silently
# write into a view of the full dataset.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [265]:
# Rich-display the filtered two-country frame to sanity-check the selection.
df_updated
Out[265]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 10.08 37.7 40.2 76.00 41.2 0.000000
15722 Serbia 2/27/2020 439.415 10.08 37.7 40.2 76.00 41.2 0.000000
15723 Serbia 2/28/2020 439.415 10.08 37.7 40.2 76.00 41.2 0.000000
15724 Serbia 2/29/2020 439.415 10.08 37.7 40.2 76.00 41.2 0.000000
15725 Serbia 3/1/2020 439.415 10.08 37.7 40.2 76.00 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 43.0 2.036403

2076 rows × 9 columns

In [266]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive all three lags (1 day, 7 days, 30 days) from a single per-country
# grouping of the mortality series instead of re-grouping for every shift.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [267]:
# Zero-fill the leading NaNs that shift() introduced at the start of each
# country's lagged series (no mortality observed before the window opens).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [268]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged-mortality
# columns, so the target itself enters the PCA -- components built on them (and any
# model trained on those components) leak the label, which likely explains the
# near-perfect R^2 downstream. PCA is also fit on the FULL dataset (train + test)
# and on unscaled columns with very different magnitudes; it should be fit on
# scaled training features only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[268]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [269]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables kept for the Random Forest Model Analysis
# Project the data and keep only the first n_components (highest-variance) components.
# NOTE(review): the projection inherits the leakage flagged where the PCA was fit --
# the target and its lagged copies are among the projected columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [270]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components PC1..PC6, not the original
# variables -- reusing raw feature names as labels is misleading and propagates into
# the feature-importance table later.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [271]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream -- X is built
# from principal_df and y from 'Mortality Rate' -- so this step appears removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [272]:
# Model inputs are the six PCA columns; the target is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [273]:
# Learn standardization parameters (mean/variance) from the training split only;
# the trailing expression displays the fitted estimator's repr, as before.
scaler = StandardScaler().fit(X_train)
scaler
Out[273]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [274]:
# Apply scaling on the training set
# NOTE(review): standardization does not affect tree-based models such as random
# forests (split points are scale-invariant), so this step is harmless but unnecessary.
X_train_scaled = scaler.transform(X_train)
In [275]:
# Apply scaling on the test set
# (correctly reuses the scaler fitted on the training split -- no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [276]:
# Base estimator for the grid search; random_state fixes the bootstrap/feature
# sampling so results are reproducible (its n_estimators is overridden by the grid).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [277]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81-candidate x 10-fold search across all CPU cores;
# it does not change the fitted results (the RF seed is fixed via random_state).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (default scoring for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9957145121672465
In [278]:
# Refit a fresh random forest configured with the hyperparameters the grid
# search selected; ** unpacks best_params_ (n_estimators, max_depth,
# min_samples_split, min_samples_leaf) instead of reading each key by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test split for evaluation in the next cell.
y_pred = best_rf_model.predict(X_test_scaled)
In [279]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between two
# probability distributions (each input is normalized to sum to 1). y_test/y_pred are
# regression targets, not distributions, so this value is not a standard regression
# metric -- confirm it is intentional before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0035860097157909237
R2 Score: 0.9979121949707439
RMSE: 0.059883
Entropy Value: 0.00028292645399796484
In [280]:
# Rank model inputs by the random forest's impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, but the rows here are
# labelled with the original feature names from selected_cols. Each "feature" is really
# PC1..PC6, a mixture of all original variables -- these labels should not be read as
# importances of the named raw features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[280]:
feature importance
5 median_age 0.414324
0 cardiovasc_death_rate 0.413998
1 diabetes_prevalence 0.146518
2 female_smokers 0.021706
3 male_smokers 0.002908
4 life_expectancy 0.000546
In [281]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- not portable. Consider a DATA_DIR
# pathlib constant in a config cell so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[281]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [282]:
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame an independent DataFrame, so the lagged-column
# assignments in later cells cannot trigger SettingWithCopyWarning or silently
# write into a view of the full dataset.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [283]:
# Rich-display the filtered two-country frame to sanity-check the selection.
df_updated
Out[283]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403

2076 rows × 9 columns

In [284]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive all three lags (1 day, 7 days, 30 days) from a single per-country
# grouping of the mortality series instead of re-grouping for every shift.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [285]:
# Zero-fill the leading NaNs that shift() introduced at the start of each
# country's lagged series (no mortality observed before the window opens).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [286]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged-mortality
# columns, so the target itself enters the PCA -- components built on them (and any
# model trained on those components) leak the label, which likely explains the
# near-perfect R^2 downstream. PCA is also fit on the FULL dataset (train + test)
# and on unscaled columns with very different magnitudes (e.g. population vs. HDI);
# it should be fit on scaled training features only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[286]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [287]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables kept for Random Forest Model Analysis
# Project the data and keep only the first n_components (highest-variance) components.
# NOTE(review): the projection inherits the leakage flagged where the PCA was fit --
# the target and its lagged copies are among the projected columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [288]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components PC1..PC6, not the original
# variables -- reusing raw feature names as labels is misleading and propagates into
# the feature-importance table later.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [289]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream -- X is built
# from principal_df and y from 'Mortality Rate' -- so this step appears removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [290]:
# Model inputs are the six PCA columns; the target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [291]:
# Learn standardization parameters (mean/variance) from the training split only;
# the trailing expression displays the fitted estimator's repr, as before.
scaler = StandardScaler().fit(X_train)
scaler
Out[291]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [292]:
# Apply scaling on the training set
# NOTE(review): standardization does not affect tree-based models such as random
# forests (split points are scale-invariant), so this step is harmless but unnecessary.
X_train_scaled = scaler.transform(X_train)
In [293]:
# Apply scaling on the test set
# (correctly reuses the scaler fitted on the training split -- no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [294]:
# Base estimator for the grid search; random_state fixes the bootstrap/feature
# sampling so results are reproducible (its n_estimators is overridden by the grid).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [295]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81-candidate x 10-fold search across all CPU cores;
# it does not change the fitted results (the RF seed is fixed via random_state).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (default scoring for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9942151184753699
In [296]:
# Refit a fresh random forest configured with the hyperparameters the grid
# search selected; ** unpacks best_params_ (n_estimators, max_depth,
# min_samples_split, min_samples_leaf) instead of reading each key by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test split for evaluation in the next cell.
y_pred = best_rf_model.predict(X_test_scaled)
In [297]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between two
# probability distributions (each input is normalized to sum to 1). y_test/y_pred are
# regression targets, not distributions, so this value is not a standard regression
# metric -- confirm it is intentional before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003107698404524249
R2 Score: 0.9981906718406797
RMSE: 0.055747
Entropy Value: 0.0004506715934095217
In [298]:
# Rank model inputs by the random forest's impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, but the rows here are
# labelled with the original feature names from selected_cols. Each "feature" is really
# PC1..PC6, a mixture of all original variables -- these labels should not be read as
# importances of the named raw features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[298]:
feature importance
5 population 0.753233
1 human_development_index 0.216832
2 extreme_poverty 0.021982
3 gdp_per_capita 0.007136
4 population_density 0.000814
0 hospital_beds_per_thousand 0.000003
In [299]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- not portable. Consider a DATA_DIR
# pathlib constant in a config cell so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[299]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [300]:
country1 = 'Slovenia'
country2 = 'Spain'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame an independent DataFrame, so the lagged-column
# assignments in later cells cannot trigger SettingWithCopyWarning or silently
# write into a view of the full dataset.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [301]:
# Rich-display the filtered two-country frame to sanity-check the selection.
df_updated
Out[301]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
24074 Spain 2/1/2020 99.403 7.17 27.4 31.4 83.56 45.5 0.000000
24075 Spain 2/2/2020 99.403 7.17 27.4 31.4 83.56 45.5 0.000000
24076 Spain 2/3/2020 99.403 7.17 27.4 31.4 83.56 45.5 0.000000
24077 Spain 2/4/2020 99.403 7.17 27.4 31.4 83.56 45.5 0.000000
24078 Spain 2/5/2020 99.403 7.17 27.4 31.4 83.56 45.5 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 44.5 0.536669

2125 rows × 9 columns

In [302]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive all three lags (1 day, 7 days, 30 days) from a single per-country
# grouping of the mortality series instead of re-grouping for every shift.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [303]:
# Zero-fill the leading NaNs that shift() introduced at the start of each
# country's lagged series (no mortality observed before the window opens).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [304]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged-mortality
# columns, so the target itself enters the PCA -- components built on them (and any
# model trained on those components) leak the label, which likely explains the
# near-perfect R^2 downstream. PCA is also fit on the FULL dataset (train + test)
# and on unscaled columns with very different magnitudes; it should be fit on
# scaled training features only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[304]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [305]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables kept for the Random Forest Model Analysis
# Project the data and keep only the first n_components (highest-variance) components.
# NOTE(review): the projection inherits the leakage flagged where the PCA was fit --
# the target and its lagged copies are among the projected columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [306]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components PC1..PC6, not the original
# variables -- reusing raw feature names as labels is misleading and propagates into
# the feature-importance table later.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [307]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream -- X is built
# from principal_df and y from 'Mortality Rate' -- so this step appears removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [308]:
# Model inputs are the six PCA columns; the target is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [309]:
# Learn standardization parameters (mean/variance) from the training split only;
# the trailing expression displays the fitted estimator's repr, as before.
scaler = StandardScaler().fit(X_train)
scaler
Out[309]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [310]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [311]:
# Apply scaling on the test set
# (correctly reuses the scaler fitted on the training split -- no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [312]:
# Base estimator for the grid search; random_state fixes the bootstrap/feature
# sampling so results are reproducible (its n_estimators is overridden by the grid).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [313]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81-candidate x 10-fold search across all CPU cores;
# it does not change the fitted results (the RF seed is fixed via random_state).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (default scoring for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985825851316085
In [314]:
# Refit a fresh random forest configured with the hyperparameters the grid
# search selected; ** unpacks best_params_ (n_estimators, max_depth,
# min_samples_split, min_samples_leaf) instead of reading each key by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test split for evaluation in the next cell.
y_pred = best_rf_model.predict(X_test_scaled)
In [315]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between two
# probability distributions (each input is normalized to sum to 1). y_test/y_pred are
# regression targets, not distributions, so this value is not a standard regression
# metric -- confirm it is intentional before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004655331865096316
R2 Score: 0.999286023415865
RMSE: 0.068230
Entropy Value: 0.00032059930777523355
In [316]:
# Rank model inputs by the random forest's impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, but the rows here are
# labelled with the original feature names from selected_cols. Each "feature" is really
# PC1..PC6, a mixture of all original variables -- these labels should not be read as
# importances of the named raw features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[316]:
feature importance
1 diabetes_prevalence 0.897896
0 cardiovasc_death_rate 0.073585
2 female_smokers 0.020414
3 male_smokers 0.004171
5 median_age 0.003642
4 life_expectancy 0.000293
In [317]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- not portable. Consider a DATA_DIR
# pathlib constant in a config cell so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[317]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [318]:
country1 = 'Slovenia'
country2 = 'Spain'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame an independent DataFrame, so the lagged-column
# assignments in later cells cannot trigger SettingWithCopyWarning or silently
# write into a view of the full dataset.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [319]:
# Rich-display the filtered two-country frame to sanity-check the selection.
df_updated
Out[319]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
24074 Spain 2/1/2020 2.97 0.904 1.0 34272.36 93.105 47558632 0.000000
24075 Spain 2/2/2020 2.97 0.904 1.0 34272.36 93.105 47558632 0.000000
24076 Spain 2/3/2020 2.97 0.904 1.0 34272.36 93.105 47558632 0.000000
24077 Spain 2/4/2020 2.97 0.904 1.0 34272.36 93.105 47558632 0.000000
24078 Spain 2/5/2020 2.97 0.904 1.0 34272.36 93.105 47558632 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.84 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.84 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.84 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.84 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.84 102.619 2119843 0.536669

2125 rows × 9 columns

In [320]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive all three lags (1 day, 7 days, 30 days) from a single per-country
# grouping of the mortality series instead of re-grouping for every shift.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [321]:
# The first 1/7/30 rows of each country's series have no prior observation,
# so the lag columns start as NaN; treat those as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [322]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] covers every column after 'location'/'date', which
# here includes 'Mortality Rate' (the target) and its three lag columns — the
# fitted components therefore leak the target into the model inputs; confirm
# this is intended. PCA is also fitted on unscaled data, so large-magnitude
# columns (e.g. population) will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[322]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [323]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project onto the first 6 components (ordered by explained variance); the
# transform must receive the same column slice the PCA was fitted on.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [324]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL
# the PCA input columns), not the original features — reusing the original
# feature names makes the later feature-importance table misleading; names
# like 'PC1'..'PC6' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [325]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream —
# X is built from principal_df — so this step only affects df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [326]:
# Model inputs: principal-component scores (the labels reuse the original
# feature names even though each column is a component).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split with a fixed seed for reproducibility. NOTE(review): rows are
# daily time-series observations, so a random split mixes past and future;
# a time-based split would be more defensible for this data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [327]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/std from the training split only, so
# the test split is later transformed without peeking at its statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[327]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [328]:
# Apply scaling on the training set
# Standardize with the statistics fitted on the training split.
X_train_scaled = scaler.transform(X_train)
In [329]:
# Apply scaling on the test set
# Same training-set statistics are reused; the scaler is not refitted.
X_test_scaled = scaler.transform(X_test)
In [330]:
# Base estimator for the grid search; the grid below overrides n_estimators.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [331]:
# perform grid search and 10-fold cross-validation (k = 10)
# 81 parameter combinations x 10 folds = 810 forest fits — this is the
# expensive cell of the pass.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# best_score_ is the mean cross-validated score of the best candidate
# (the estimator's default scorer — R^2 for a regressor).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983652038818789
In [332]:
# Refit a fresh forest on the full training split using the hyperparameters
# the grid search selected; **-unpacking avoids repeating each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Hold-out predictions used for evaluation below.
y_pred = best_rf_model.predict(X_test_scaled)
In [333]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1
# and returns the KL divergence between them — it treats the raw mortality
# values as probability distributions and is not a standard regression metric;
# confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006147021306110068
R2 Score: 0.9990572467437505
RMSE: 0.078403
Entropy Value: 0.00037964640931083954
In [334]:
# Rank the model inputs by impurity-based importance, highest first.
# (These inputs are principal-component scores labelled with the original
# feature names, so read the ranking with that caveat.)
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[334]:
feature importance
1 human_development_index 0.956053
2 extreme_poverty 0.023320
5 population 0.012961
0 hospital_beds_per_thousand 0.004916
3 gdp_per_capita 0.002334
4 population_density 0.000416
In [335]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[335]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [336]:
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): this cell and the pass that follows repeat the same pipeline
# already run for the earlier country pair; a helper function parameterized by
# (country1, country2, feature columns) would remove the copy-paste duplication.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [337]:
# Inspect the filtered frame: rows for the two selected countries only.
df_updated
Out[337]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 40.8 22.222222
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 99.739 5.59 22.6 28.9 83.78 43.1 0.322922
14645 Switzerland 12/26/2022 99.739 5.59 22.6 28.9 83.78 43.1 0.322922
14646 Switzerland 12/27/2022 99.739 5.59 22.6 28.9 83.78 43.1 0.322922
14647 Switzerland 12/28/2022 99.739 5.59 22.6 28.9 83.78 43.1 0.323082
14648 Switzerland 12/29/2022 99.739 5.59 22.6 28.9 83.78 43.1 0.322149

2102 rows × 9 columns

In [338]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Shifts are done within each country group so series don't bleed into each other.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [339]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The first 1/7/30 rows of each country's series have no prior observation.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [340]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' (the target) and its lag
# columns, so the components leak the target; PCA is also fitted on unscaled
# data, letting large-magnitude columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[340]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [341]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Keep only the first 6 components (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [342]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (mixtures of all PCA
# inputs), not the original features — reusing the feature names makes the
# later importance table misleading; 'PC1'..'PC6' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [343]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X comes from
# principal_df), so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [345]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): rows are daily observations, so a random 70/30 split mixes
# past and future; a time-based split would be more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [346]:
# Fit scaling on the training set
# Mean/std are learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[346]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [347]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [348]:
# Apply scaling on the test set
# Uses the training-set statistics; the scaler is not refitted.
X_test_scaled = scaler.transform(X_test)
In [349]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [350]:
# perform grid search and 10-fold cross-validation (k = 10)
# 81 combinations x 10 folds = 810 forest fits — the expensive cell of this pass.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Best CV score: 0.9600280576636526
In [351]:
# fit random forest model with best hyperparameters from above
# (equivalently: RandomForestRegressor(**grid_search.best_params_, random_state=42))
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [352]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) returns a KL divergence between
# normalized arrays — not a standard regression metric; confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.7087942445483512
R2 Score: 0.9720119467742431
RMSE: 0.841899
Entropy Value: 0.005512737732013974
In [353]:
# Impurity-based importances, highest first. NOTE(review): the 'feature'
# labels are principal components carrying original-feature names, so the
# ranking describes components, not the raw features.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[353]:
feature importance
0 cardiovasc_death_rate 0.816569
1 diabetes_prevalence 0.048619
5 median_age 0.047136
2 female_smokers 0.042191
3 male_smokers 0.032730
4 life_expectancy 0.012756
In [354]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[354]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [355]:
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Extracting important features for the Random Forest Model Analysis for the country health index
# NOTE(review): same pipeline as the earlier passes, copy-pasted with a
# different country pair / feature set — a parameterized helper function
# would avoid the duplication.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [356]:
# Inspect the filtered frame: rows for the two selected countries only.
df_updated
Out[356]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.20 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.20 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.20 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.20 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.20 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14645 Switzerland 12/26/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14646 Switzerland 12/27/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14647 Switzerland 12/28/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.323082
14648 Switzerland 12/29/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322149

2102 rows × 9 columns

In [357]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Shifts are done within each country group so series don't bleed into each other.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [358]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The first 1/7/30 rows of each country's series have no prior observation.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [359]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' (the target) and its lag
# columns, so the components leak the target; PCA is also fitted on unscaled
# data, letting large-magnitude columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[359]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [360]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Keep only the first 6 components (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [361]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (mixtures of all PCA
# inputs), not the original features — reusing the feature names makes the
# later importance table misleading; 'PC1'..'PC6' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [362]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X comes from
# principal_df), so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [363]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): rows are daily observations, so a random 70/30 split mixes
# past and future; a time-based split would be more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [364]:
# Fit scaling on the training set
# Mean/std are learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[364]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [365]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [366]:
# Apply scaling on the test set
# Uses the training-set statistics; the scaler is not refitted.
X_test_scaled = scaler.transform(X_test)
In [367]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [368]:
# perform grid search and 10-fold cross-validation (k = 10)
# 81 combinations x 10 folds = 810 forest fits — the expensive cell of this pass.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.958329239005943
In [369]:
# fit random forest model with best hyperparameters from above
# (equivalently: RandomForestRegressor(**grid_search.best_params_, random_state=42))
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [370]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) returns a KL divergence between
# normalized arrays — not a standard regression metric; confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.5800890923694779
R2 Score: 0.9770941080323481
RMSE: 0.761636
Entropy Value: 0.004716031800673148
In [371]:
# Impurity-based importances, highest first. NOTE(review): the 'feature'
# labels are principal components carrying original-feature names, so the
# ranking describes components, not the raw features.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[371]:
feature importance
1 human_development_index 0.832172
5 population 0.070924
2 extreme_poverty 0.041400
3 gdp_per_capita 0.035553
4 population_density 0.019898
0 hospital_beds_per_thousand 0.000053
In [372]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[372]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [373]:
country1 = 'Cyprus'
country2 = 'Iceland'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): same pipeline as the earlier passes, copy-pasted with a
# different country pair — a parameterized helper function would avoid the
# duplication.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [374]:
# Inspect the filtered frame: rows for the two selected countries only.
df_updated
Out[374]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 37.3 0.00000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 37.3 0.00000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 37.3 0.00000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 37.3 0.00000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 37.3 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 37.3 0.11011

2063 rows × 9 columns

In [375]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Shifts are done within each country group so series don't bleed into each other.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [376]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The first 1/7/30 rows of each country's series have no prior observation.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [377]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' (the target) and its lag
# columns, so the components leak the target; PCA is also fitted on unscaled
# data, letting large-magnitude columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[377]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [378]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Keep only the first 6 components (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [379]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (mixtures of all PCA
# inputs), not the original features — reusing the feature names makes the
# later importance table misleading; 'PC1'..'PC6' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [380]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X comes from
# principal_df), so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [381]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): rows are daily observations, so a random 70/30 split mixes
# past and future; a time-based split would be more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [382]:
# Fit scaling on the training set
# Mean/std are learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[382]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [383]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [384]:
# Apply scaling on the test set
# Uses the training-set statistics; the scaler is not refitted.
X_test_scaled = scaler.transform(X_test)
In [385]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [386]:
# perform grid search and 10-fold cross-validation (k = 10)
# 81 combinations x 10 folds = 810 forest fits — the expensive cell of this pass.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9764230567132627
In [387]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ directly instead of copying each key by hand — less
# error-prone and stays correct if the grid gains parameters.
# (grid_search.best_estimator_ is an equivalent, already-refit alternative.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set for the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [388]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# normalized distributions — y_test/y_pred are regression targets, not
# probabilities, so this "entropy" is not a meaningful regression metric and
# returns inf if any y_pred element is 0 where the matching y_test is not.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0006385841997698732
R2 Score: 0.99674427235998
RMSE: 0.025270
Entropy Value: 0.0005132911833312269
In [389]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PC6, not to the original variables whose names
# were reused as column labels — read them as component importances.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Last expression: rich display of the sorted importance table.
feature_importances
Out[389]:
feature importance
1 diabetes_prevalence 0.522674
0 cardiovasc_death_rate 0.418221
2 female_smokers 0.031807
5 median_age 0.012626
3 male_smokers 0.007793
4 life_expectancy 0.006879
In [390]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable Path
# constant (e.g. DATA_DIR) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[390]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [391]:
country1 = 'Cyprus'
country2 = 'Iceland'

# Extracting important features for the Random Forest Model Analysis for the country health index
# Keep only the two countries under comparison and the columns this analysis uses.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [392]:
df_updated
Out[392]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2063 rows × 9 columns

In [393]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# .assign() returns a new frame, avoiding pandas' SettingWithCopyWarning
# (df_updated is a filtered slice here). Shifting within each location keeps
# lags from bleeding across country boundaries.
mortality_by_loc = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_loc.shift(1),
    prev_week_mortality=mortality_by_loc.shift(7),
    prev_month_mortality=mortality_by_loc.shift(30),
)
In [394]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# One vectorised fillna over the three lag columns replaces three per-column
# assignments; NaNs only occur in the first 1/7/30 rows of each country where
# no prior observation exists.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [395]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and its three
# lag columns, so the TARGET leaks into the fitted components — the downstream
# model's near-perfect R^2 is largely an artifact of this. Drop the target and
# its lags from the PCA input (here and in the transform cell) to fix.
# NOTE(review): PCA is fit on unscaled data; large-magnitude variables (e.g.
# population) dominate the components. Standardize before fitting PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[395]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [396]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project onto the fitted components and keep the first six (highest variance).
# NOTE(review): same target-leakage caveat as the fit cell — iloc[:,2:]
# includes 'Mortality Rate' and its lags.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [397]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components PC1..PC6, NOT the
# original variables — reusing the original feature names is misleading and
# makes the downstream "feature importances" describe components instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Positional alignment: pca.transform preserved df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [398]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used as model
# inputs below (X comes from principal_df); this step effectively only removes
# the string 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [399]:
# Model inputs: the six principal components (labelled with the original
# variable names) and the mortality-rate target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): the PCA behind X was fit on a matrix that included the target
# and its lags, so X leaks the target — interpret the high R^2 accordingly.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [400]:
# Fit scaling on the training set
# Learn mean/std from the training split only, so no test-set statistics leak
# into the transform.
# NOTE(review): scaling is applied AFTER PCA here; the conventional order is to
# standardize first, then fit PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[400]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [401]:
# Apply scaling on the training set
# Uses the mean/std learned above so train and test share the same transform.
X_train_scaled = scaler.transform(X_train)
In [402]:
# Apply scaling on the test set
# Reuses the training-set statistics; never refit the scaler on test data.
X_test_scaled = scaler.transform(X_test)
In [403]:
# Instantiate the RandomForestRegressor Model
# Base estimator; the grid search below overrides n_estimators. random_state
# pins bootstrap/feature sampling for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3 * 3 * 3 * 3 = 81 candidate combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [404]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81 x 10 fits across all cores; it does not change
# the selected model or scores (the estimator itself is seeded).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score (R^2 for regressors)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9766546149550385
In [405]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ directly instead of copying each key by hand — less
# error-prone and stays correct if the grid gains parameters.
# (grid_search.best_estimator_ is an equivalent, already-refit alternative.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set for the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [406]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# normalized distributions — y_test/y_pred are regression targets, not
# probabilities, so this "entropy" is not a meaningful regression metric and
# returns inf if any y_pred element is 0 where the matching y_test is not.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0006872215957745136
R2 Score: 0.9964963017484805
RMSE: 0.026215
Entropy Value: 0.0005144780719953724
In [407]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PC6, not to the original variables whose names
# were reused as column labels — read them as component importances.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Last expression: rich display of the sorted importance table.
feature_importances
Out[407]:
feature importance
1 human_development_index 0.918759
2 extreme_poverty 0.029885
5 population 0.021024
0 hospital_beds_per_thousand 0.013524
3 gdp_per_capita 0.010039
4 population_density 0.006769
In [408]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable Path
# constant (e.g. DATA_DIR) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[408]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [409]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for Random Forest Model Analysis for the population health index
# Keep only the two countries under comparison and the columns this analysis uses.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [410]:
df_updated
Out[410]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 26.0 82.25 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 26.0 82.25 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 26.0 82.25 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 26.0 82.25 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 26.0 82.25 39.7 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 25.7 82.30 38.7 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 25.7 82.30 38.7 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 25.7 82.30 38.7 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 25.7 82.30 38.7 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 25.7 82.30 38.7 0.491388

2076 rows × 9 columns

In [411]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# .assign() returns a new frame, avoiding pandas' SettingWithCopyWarning
# (df_updated is a filtered slice here). Shifting within each location keeps
# lags from bleeding across country boundaries.
mortality_by_loc = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_loc.shift(1),
    prev_week_mortality=mortality_by_loc.shift(7),
    prev_month_mortality=mortality_by_loc.shift(30),
)
In [412]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# One vectorised fillna over the three lag columns replaces three per-column
# assignments; NaNs only occur in the first 1/7/30 rows of each country where
# no prior observation exists.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [413]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and its three
# lag columns, so the TARGET leaks into the fitted components — the downstream
# model's near-perfect R^2 is largely an artifact of this. Drop the target and
# its lags from the PCA input (here and in the transform cell) to fix.
# NOTE(review): PCA is fit on unscaled data; large-magnitude variables dominate
# the components. Standardize before fitting PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[413]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [414]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project onto the fitted components and keep the first six (highest variance).
# NOTE(review): same target-leakage caveat as the fit cell — iloc[:,2:]
# includes 'Mortality Rate' and its lags.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [415]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components PC1..PC6, NOT the
# original variables — reusing the original feature names is misleading and
# makes the downstream "feature importances" describe components instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
# Positional alignment: pca.transform preserved df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [416]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used as model
# inputs below (X comes from principal_df); this step effectively only removes
# the string 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [417]:
# Model inputs: the six principal components (labelled with the original
# variable names) and the mortality-rate target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): the PCA behind X was fit on a matrix that included the target
# and its lags, so X leaks the target — interpret the high R^2 accordingly.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [418]:
# Fit scaling on the training set
# Learn mean/std from the training split only, so no test-set statistics leak
# into the transform.
# NOTE(review): scaling is applied AFTER PCA here; the conventional order is to
# standardize first, then fit PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[418]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [419]:
# Apply scaling on the training set
# Uses the mean/std learned above so train and test share the same transform.
X_train_scaled = scaler.transform(X_train)
In [420]:
# Apply scaling on the test set
# Reuses the training-set statistics; never refit the scaler on test data.
X_test_scaled = scaler.transform(X_test)
In [421]:
# Instantiate the RandomForestRegressor Model
# Base estimator; the grid search below overrides n_estimators. random_state
# pins bootstrap/feature sampling for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3 * 3 * 3 * 3 = 81 candidate combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [422]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81 x 10 fits across all cores; it does not change
# the selected model or scores (the estimator itself is seeded).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score (R^2 for regressors)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9979483134774702
In [423]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ directly instead of copying each key by hand — less
# error-prone and stays correct if the grid gains parameters.
# (grid_search.best_estimator_ is an equivalent, already-refit alternative.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set for the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [424]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# normalized distributions — y_test/y_pred are regression targets, not
# probabilities, so this "entropy" is not a meaningful regression metric and
# returns inf if any y_pred element is 0 where the matching y_test is not.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0024530560615069748
R2 Score: 0.9989257488789922
RMSE: 0.049528
Entropy Value: 0.0004755467309312025
In [425]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PC6, not to the original variables whose names
# were reused as column labels — read them as component importances.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Last expression: rich display of the sorted importance table.
feature_importances
Out[425]:
feature importance
0 cardiovasc_death_rate 0.943395
2 female_smokers 0.030175
1 diabetes_prevalence 0.018569
5 median_age 0.005552
3 male_smokers 0.001946
4 life_expectancy 0.000363
In [426]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable Path
# constant (e.g. DATA_DIR) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[426]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [427]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for the Random Forest Model Analysis for the country health index
# Keep only the two countries under comparison and the columns this analysis uses.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [428]:
df_updated
Out[428]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19869 Ireland 12/26/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19870 Ireland 12/27/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19871 Ireland 12/28/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19872 Ireland 12/29/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388

2076 rows × 9 columns

In [429]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# .assign() returns a new frame, avoiding pandas' SettingWithCopyWarning
# (df_updated is a filtered slice here). Shifting within each location keeps
# lags from bleeding across country boundaries.
mortality_by_loc = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_loc.shift(1),
    prev_week_mortality=mortality_by_loc.shift(7),
    prev_month_mortality=mortality_by_loc.shift(30),
)
In [430]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# One vectorised fillna over the three lag columns replaces three per-column
# assignments; NaNs only occur in the first 1/7/30 rows of each country where
# no prior observation exists.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [431]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and its three
# lag columns, so the TARGET leaks into the fitted components — the downstream
# model's near-perfect R^2 is largely an artifact of this. Drop the target and
# its lags from the PCA input (here and in the transform cell) to fix.
# NOTE(review): PCA is fit on unscaled data; large-magnitude variables (e.g.
# population) dominate the components. Standardize before fitting PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[431]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [432]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project onto the fitted components and keep the first six (highest variance).
# NOTE(review): same target-leakage caveat as the fit cell — iloc[:,2:]
# includes 'Mortality Rate' and its lags.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [433]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components PC1..PC6, NOT the
# original variables — reusing the original feature names is misleading and
# makes the downstream "feature importances" describe components instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Positional alignment: pca.transform preserved df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [434]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used as model
# inputs below (X comes from principal_df); this step effectively only removes
# the string 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [435]:
# Model inputs: the six principal components (labelled with the original
# variable names) and the mortality-rate target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): the PCA behind X was fit on a matrix that included the target
# and its lags, so X leaks the target — interpret the high R^2 accordingly.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [436]:
# Fit scaling on the training set
# Learn mean/std from the training split only, so no test-set statistics leak
# into the transform.
# NOTE(review): scaling is applied AFTER PCA here; the conventional order is to
# standardize first, then fit PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[436]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [437]:
# Apply scaling on the training set
# Uses the mean/std learned above so train and test share the same transform.
X_train_scaled = scaler.transform(X_train)
In [438]:
# Apply scaling on the test set
# Reuses the training-set statistics; never refit the scaler on test data.
X_test_scaled = scaler.transform(X_test)
In [439]:
# Instantiate the RandomForestRegressor Model
# Base estimator; the grid search below overrides n_estimators. random_state
# pins bootstrap/feature sampling for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3 * 3 * 3 * 3 = 81 candidate combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [440]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81 x 10 fits across all cores; it does not change
# the selected model or scores (the estimator itself is seeded).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score (R^2 for regressors)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998351364576265
In [441]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ directly instead of copying each key by hand — less
# error-prone and stays correct if the grid gains parameters.
# (grid_search.best_estimator_ is an equivalent, already-refit alternative.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set for the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [442]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# normalized distributions — y_test/y_pred are regression targets, not
# probabilities, so this "entropy" is not a meaningful regression metric and
# returns inf if any y_pred element is 0 where the matching y_test is not.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001882911265242772
R2 Score: 0.9991754287359407
RMSE: 0.043393
Entropy Value: 0.0004605212242426924
In [443]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PC6, not to the original variables whose names
# were reused as column labels — read them as component importances.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Last expression: rich display of the sorted importance table.
feature_importances
Out[443]:
feature importance
5 population 0.523426
1 human_development_index 0.437981
2 extreme_poverty 0.036239
3 gdp_per_capita 0.001743
4 population_density 0.000594
0 hospital_beds_per_thousand 0.000017
In [444]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable Path
# constant (e.g. DATA_DIR) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[444]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [445]:
country1 = 'Slovakia'
country2 = 'United States'

# Extracting important features for Random Forest Model Analysis for the population health index
# Keep only the two countries under comparison and the columns this analysis uses.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [446]:
df_updated
Out[446]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 38.3 1.084791

2102 rows × 9 columns

In [447]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift inside one country's series, so a lag
# never crosses the boundary between the two countries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [448]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): these NaNs are the first 1/7/30 rows of each country where no lag
# exists; filling with 0 fabricates a "zero mortality" history for those rows —
# confirm this is intended rather than dropping them.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [449]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the prediction target) and its three lag columns — target
# leakage into the derived features. PCA is also fit on unscaled data and on the full
# dataset before the train/test split. Confirm all three are intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[449]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [450]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [451]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the raw feature names, but each column is a
# principal component (a linear mix of ALL inputs, including the mortality columns),
# not the named feature — the labels, and the importance plot built on them, are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [452]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not used below (X comes
# from principal_df, y from 'Mortality Rate') — confirm this cell is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [453]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, so past and future observations of
# the same country mix across train/test — for time series a chronological split is usual.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [454]:
# Fit scaling on the training set
# NOTE(review): standardization here is applied AFTER PCA to the component scores;
# the conventional pipeline scales the raw features BEFORE PCA — confirm the ordering.
scaler = StandardScaler()
scaler.fit(X_train)
Out[454]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [455]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [456]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [457]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [458]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9737847333872919
In [459]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [460]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.016014923250739586
R2 Score: 0.984946595950419
RMSE: 0.126550
Entropy Value: 0.001487608489672066
In [461]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[461]:
feature importance
1 diabetes_prevalence 0.540912
0 cardiovasc_death_rate 0.306352
5 median_age 0.082899
2 female_smokers 0.042828
3 male_smokers 0.019670
4 life_expectancy 0.007340
In [462]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[462]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [463]:
country1 = 'Slovakia'
country2 = 'United States'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [464]:
df_updated
Out[464]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2102 rows × 9 columns

In [465]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift inside one country's series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [466]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading 1/7/30 rows with 0 fabricates "zero mortality"
# history for those rows — confirm intended rather than dropping them.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [467]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] still includes 'Mortality Rate' (the target) and its three
# lag columns in the PCA input — target leakage; PCA is also fit on unscaled data and
# before the train/test split. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[467]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [468]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [469]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a mix of all PCA inputs,
# including the mortality columns), not the raw feature it is named after — the labels
# and any importance ranking built on them are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [470]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are not used below — confirm still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [471]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): the shuffled split mixes past and future of the same country across
# train/test — a chronological split is usual for time series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [472]:
# Fit scaling on the training set
# NOTE(review): scaling is applied after PCA to the component scores; the usual
# pipeline standardizes the raw features before PCA — confirm the ordering.
scaler = StandardScaler()
scaler.fit(X_train)
Out[472]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [473]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [474]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [475]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [476]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9745517168509374
In [477]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [478]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01914917351005437
R2 Score: 0.9820005227905748
RMSE: 0.138381
Entropy Value: 0.0017834873721826897
In [479]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[479]:
feature importance
1 human_development_index 0.738239
5 population 0.187593
2 extreme_poverty 0.045517
3 gdp_per_capita 0.018252
4 population_density 0.010388
0 hospital_beds_per_thousand 0.000009
In [6]:
# Country Pair by Pair Analysis relative to diabetes prevalence
In [7]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR constant.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[7]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [8]:
# Showing the pairings of countries based on diabetes prevalence (13 pairs of countries)
df_Belgium = df[(df.location == "Belgium")]
df_Estonia = df[(df.location == "Estonia")]

df_France = df[(df.location == "France")]
df_Iceland = df[(df.location == "Iceland")]

df_Ireland = df[(df.location == "Ireland")]
df_Italy = df[(df.location == "Italy")]

df_Latvia = df[(df.location == "Latvia")]
df_Luxembourg = df[(df.location == "Luxembourg")]

df_Netherlands = df[(df.location == "Netherlands")]
df_Sweden = df[(df.location == "Sweden")]

df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_Austria = df[(df.location == "Austria")]

df_Bulgaria = df[(df.location == "Bulgaria")]
df_Czechia = df[(df.location == "Czechia")]

df_Denmark = df[(df.location == "Denmark")]
df_Finland = df[(df.location == "Finland")]

df_Switzerland = df[(df.location == "Switzerland")]
df_Canada = df[(df.location == "Canada")]

df_Cyprus = df[(df.location == "Cyprus")]
df_Portugal = df[(df.location == "Portugal")]

df_Romania = df[(df.location == "Romania")]
df_Serbia = df[(df.location == "Serbia")]

df_Slovakia = df[(df.location == "Slovakia")]
df_Slovenia = df[(df.location == "Slovenia")]

df_Spain = df[(df.location == "Spain")]
df_UnitedStates = df[(df.location == "United States")]
In [9]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [10]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
dataframe_one.to_csv("dataframe-one.csv")
In [11]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[11]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [10]:
country1 = 'Belgium'
country2 = 'Estonia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [11]:
df_updated
Out[11]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 25.1 31.4 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 25.1 31.4 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 25.1 31.4 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 25.1 31.4 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 25.1 31.4 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 255.569 24.5 39.3 78.74 19.452 42.7 0.464100
7306 Estonia 12/26/2022 255.569 24.5 39.3 78.74 19.452 42.7 0.464100
7307 Estonia 12/27/2022 255.569 24.5 39.3 78.74 19.452 42.7 0.463645
7308 Estonia 12/28/2022 255.569 24.5 39.3 78.74 19.452 42.7 0.466423
7309 Estonia 12/29/2022 255.569 24.5 39.3 78.74 19.452 42.7 0.466423

2121 rows × 9 columns

In [12]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift inside one country's series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [13]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading 1/7/30 rows with 0 fabricates "zero mortality"
# history for those rows — confirm intended rather than dropping them.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [14]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] still includes 'Mortality Rate' (the target) and its three
# lag columns in the PCA input — target leakage; PCA is also fit on unscaled data and
# before the train/test split. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[14]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [15]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [16]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a mix of all PCA inputs,
# including the mortality columns), not the raw feature it is named after — the labels
# and any importance ranking built on them are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [17]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are not used below — confirm still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [18]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): the shuffled split mixes past and future of the same country across
# train/test — a chronological split is usual for time series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [19]:
# Fit scaling on the training set
# NOTE(review): scaling is applied after PCA to the component scores; the usual
# pipeline standardizes the raw features before PCA — confirm the ordering.
scaler = StandardScaler()
scaler.fit(X_train)
Out[19]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [20]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [21]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [22]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [23]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986082260800637
In [24]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [25]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.015400807395380635
R2 Score: 0.9987798823156795
RMSE: 0.124100
Entropy Value: 0.0006057917033947405
In [26]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[26]:
feature importance
1 female_smokers 0.721427
0 cardiovasc_death_rate 0.231062
2 male_smokers 0.038212
5 median_age 0.006339
3 life_expectancy 0.002593
4 aged_65_older 0.000368
In [27]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[27]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [28]:
# Country pair compared in this Random Forest run (country health index).
country1 = 'Belgium'
country2 = 'Estonia'

# Keep only the rows for the two countries of interest, restricted to the
# identifier columns plus the country-health-index features and the target.
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
In [29]:
# Preview the filtered frame (rows for the two selected countries only).
df_updated
Out[29]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7306 Estonia 12/26/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7307 Estonia 12/27/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.463645
7308 Estonia 12/28/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423
7309 Estonia 12/29/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423

2121 rows × 9 columns

In [30]:
# Convert the per-country mortality time series into supervised-learning
# features: a Random Forest treats rows as independent samples, so past
# mortality must be made explicit as lag columns rather than implied by
# row order (this replaces the original explanatory string literal).
#
# NOTE(review): df_updated is a filtered slice of the loaded frame; copy
# first so the column assignments below cannot raise SettingWithCopyWarning.
df_updated = df_updated.copy()

# Group once and reuse for all three shifts (1 day, 7 days, ~30 days;
# assumes one row per country per day — TODO confirm there are no date gaps).
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [31]:
# The first 1/7/30 rows per country have no lag history; treat the missing
# lag values as a pre-outbreak mortality rate of 0.
df_updated = df_updated.fillna({
    'prev_day_mortality': 0,
    'prev_week_mortality': 0,
    'prev_month_mortality': 0,
})
In [32]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA
# input still contains 'Mortality Rate' and its three lag columns — the
# target leaks into the components used as model inputs below, which likely
# inflates the reported R^2. Consider dropping target/lag columns first.
# NOTE(review): PCA is fit on unscaled data; wide-range features such as
# population will dominate the components (StandardScaler is only applied
# after PCA further down) — confirm this ordering is intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[32]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [33]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): these are the first 6 components of the PCA fitted above —
# linear combinations of ALL columns passed to fit (including the leaked
# target/lag columns), not the original six features themselves.
n_components = 6  # equals the number of model input variables used below
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [34]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a mix of all PCA inputs), not the raw feature it is
# named after. PC1..PC6 would be clearer, but downstream cells reference
# these names, so the labels are kept unchanged here.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [35]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below —
# X is built from principal_df — so this step only changes df_updated's shape.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [36]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# NOTE(review): X comes from principal_df and y from df_updated; they are
# aligned purely by row position, which holds because principal_df was built
# from df_updated without reordering — keep it that way.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles randomly; for time-series data a
# chronological split (e.g. TimeSeriesSplit) would avoid look-ahead leakage
# and give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [37]:
# Fit the scaler on the training split only, so test-set statistics do not
# leak into the transform applied in the next two cells.
scaler = StandardScaler()
scaler.fit(X_train)
Out[37]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [38]:
# Apply the training-set mean/std (fitted above) to the training set.
X_train_scaled = scaler.transform(X_train)
In [39]:
# Apply the same training-set mean/std to the test set (no refit).
X_test_scaled = scaler.transform(X_test)
In [40]:
# Instantiate the RandomForestRegressor Model.
# n_estimators=100 here is only a placeholder: every candidate in the grid
# below overrides it during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3*3*3*3 = 81 candidates (x 10 CV folds = 810 fits).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [41]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 candidates x 10 folds across all cores;
# results are unchanged because the estimator's random_state is fixed.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and their mean cross-validation score
# (R^2, the default scorer for RandomForestRegressor).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984095979233917
In [42]:
# Refit a random forest with the best hyperparameters from the search.
# NOTE(review): GridSearchCV (refit=True by default) already refits the best
# model on the full training set, so grid_search.best_estimator_ could be
# used directly. The explicit refit is kept, but the four keyword arguments
# are expanded with ** instead of being copied one by one.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split, used by the metrics cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [43]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and zeros in y_pred where y_test is non-zero
# make it infinite. Interpret with care or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.016179009133714634
R2 Score: 0.9987182298530174
RMSE: 0.127197
Entropy Value: 0.0006399632348900017
In [44]:
# Rank the model inputs by the fitted forest's impurity-based importance.
# (Reminder: the names refer to principal components, not raw features.)
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[44]:
feature importance
1 human_development_index 0.932614
2 extreme_poverty 0.038249
5 population 0.019770
0 hospital_beds_per_thousand 0.005374
3 gdp_per_capita 0.003504
4 population_density 0.000490
In [45]:
# Re-import the full cleaned dataframe for the next country pairing
# (the previous cells overwrote df_updated with a filtered subset).
# NOTE(review): hardcoded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[45]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [46]:
# Country pair compared in this Random Forest run (population health index).
country1 = 'France'
country2 = 'Iceland'

# Keep only the rows for the two countries of interest, restricted to the
# identifier columns plus the population-health-index features and the target.
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
In [47]:
# Preview the filtered frame (rows for the two selected countries only).
df_updated
Out[47]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
8376 France 1/24/2020 86.060 30.1 35.6 82.66 19.718 42.0 0.00000
8377 France 1/25/2020 86.060 30.1 35.6 82.66 19.718 42.0 0.00000
8378 France 1/26/2020 86.060 30.1 35.6 82.66 19.718 42.0 0.00000
8379 France 1/27/2020 86.060 30.1 35.6 82.66 19.718 42.0 0.00000
8380 France 1/28/2020 86.060 30.1 35.6 82.66 19.718 42.0 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 14.3 15.2 82.99 14.431 37.3 0.11011

2107 rows × 9 columns

In [48]:
# Convert the per-country mortality time series into supervised-learning
# features: a Random Forest treats rows as independent samples, so past
# mortality must be made explicit as lag columns rather than implied by
# row order (this replaces the original explanatory string literal).
#
# NOTE(review): df_updated is a filtered slice of the loaded frame; copy
# first so the column assignments below cannot raise SettingWithCopyWarning.
df_updated = df_updated.copy()

# Group once and reuse for all three shifts (1 day, 7 days, ~30 days;
# assumes one row per country per day — TODO confirm there are no date gaps).
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [49]:
# The first 1/7/30 rows per country have no lag history; treat the missing
# lag values as a pre-outbreak mortality rate of 0.
df_updated = df_updated.fillna({
    'prev_day_mortality': 0,
    'prev_week_mortality': 0,
    'prev_month_mortality': 0,
})
In [50]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA
# input still contains 'Mortality Rate' and its lag columns — target leakage
# that likely inflates the reported R^2. PCA is also fit on unscaled data,
# so wide-range features dominate the components — confirm intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[50]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [51]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): these are the first 6 PCA components — linear combinations
# of ALL columns passed to fit (including the leaked target/lag columns).
n_components = 6  # equals the number of model input variables used below
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [52]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — each column is a principal component,
# not the raw feature it is named after; kept only because downstream cells
# reference these names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [53]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used below (X comes from
# principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [54]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# NOTE(review): X (from principal_df) and y (from df_updated) are aligned
# purely by row position — valid only while neither frame is reordered.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffling of time-series rows causes look-ahead
# leakage; a chronological split would be more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [55]:
# Fit the scaler on the training split only, so test-set statistics do not
# leak into the transform applied in the next two cells.
scaler = StandardScaler()
scaler.fit(X_train)
Out[55]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [56]:
# Apply the training-set mean/std (fitted above) to the training set.
X_train_scaled = scaler.transform(X_train)
In [57]:
# Apply the same training-set mean/std to the test set (no refit).
X_test_scaled = scaler.transform(X_test)
In [58]:
# Instantiate the RandomForestRegressor Model.
# n_estimators=100 here is only a placeholder: every candidate in the grid
# below overrides it during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3*3*3*3 = 81 candidates (x 10 CV folds = 810 fits).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [59]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 candidates x 10 folds across all cores;
# results are unchanged because the estimator's random_state is fixed.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and their mean cross-validation score
# (R^2, the default scorer for RandomForestRegressor).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.990388976505925
In [60]:
# Refit a random forest with the best hyperparameters from the search.
# NOTE(review): GridSearchCV (refit=True by default) already refits the best
# model; grid_search.best_estimator_ could be used directly. The explicit
# refit is kept, with the keyword arguments expanded via ** instead of
# being copied one by one.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split, used by the metrics cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [61]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence of normalized
# inputs — not a regression metric; zeros in y_pred can make it infinite.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.1330022202706
R2 Score: 0.9894237774759248
RMSE: 0.364695
Entropy Value: 0.0029852162511977524
In [62]:
# Rank the model inputs by the fitted forest's impurity-based importance.
# (Reminder: the names refer to principal components, not raw features.)
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[62]:
feature importance
1 female_smokers 0.704205
0 cardiovasc_death_rate 0.270532
2 male_smokers 0.016687
3 life_expectancy 0.005344
5 median_age 0.002318
4 aged_65_older 0.000914
In [63]:
# Re-import the full cleaned dataframe for the next analysis pass
# (the previous cells overwrote df_updated with a filtered subset).
# NOTE(review): hardcoded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[63]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [64]:
# Country pair compared in this Random Forest run (country health index).
country1 = 'France'
country2 = 'Iceland'

# Keep only the rows for the two countries of interest, restricted to the
# identifier columns plus the country-health-index features and the target.
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
In [65]:
# Preview the filtered frame (rows for the two selected countries only).
df_updated
Out[65]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2107 rows × 9 columns

In [66]:
# Convert the per-country mortality time series into supervised-learning
# features: a Random Forest treats rows as independent samples, so past
# mortality must be made explicit as lag columns rather than implied by
# row order (this replaces the original explanatory string literal).
#
# NOTE(review): df_updated is a filtered slice of the loaded frame; copy
# first so the column assignments below cannot raise SettingWithCopyWarning.
df_updated = df_updated.copy()

# Group once and reuse for all three shifts (1 day, 7 days, ~30 days;
# assumes one row per country per day — TODO confirm there are no date gaps).
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [67]:
# The first 1/7/30 rows per country have no lag history; treat the missing
# lag values as a pre-outbreak mortality rate of 0.
df_updated = df_updated.fillna({
    'prev_day_mortality': 0,
    'prev_week_mortality': 0,
    'prev_month_mortality': 0,
})
In [68]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] keeps 'Mortality Rate' and its lag columns in
# the PCA input — target leakage that likely inflates the reported R^2.
# PCA is also fit on unscaled data — confirm intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[68]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [69]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): these are the first 6 PCA components — linear combinations
# of ALL columns passed to fit (including the leaked target/lag columns).
n_components = 6  # equals the number of model input variables used below
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [70]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — each column is a principal component,
# not the raw feature it is named after; kept only because downstream cells
# reference these names.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [71]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used below (X comes from
# principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [72]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# NOTE(review): X (from principal_df) and y (from df_updated) are aligned
# purely by row position — valid only while neither frame is reordered.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffling of time-series rows causes look-ahead
# leakage; a chronological split would be more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [73]:
# Fit the scaler on the training split only, so test-set statistics do not
# leak into the transform applied in the next two cells.
scaler = StandardScaler()
scaler.fit(X_train)
Out[73]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [74]:
# Apply the training-set mean/std (fitted above) to the training set.
X_train_scaled = scaler.transform(X_train)
In [75]:
# Apply the same training-set mean/std to the test set (no refit).
X_test_scaled = scaler.transform(X_test)
In [76]:
# Instantiate the RandomForestRegressor Model.
# n_estimators=100 here is only a placeholder: every candidate in the grid
# below overrides it during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3*3*3*3 = 81 candidates (x 10 CV folds = 810 fits).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [77]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 candidates x 10 folds across all cores;
# results are unchanged because the estimator's random_state is fixed.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and their mean cross-validation score
# (R^2, the default scorer for RandomForestRegressor).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9906321989356222
In [78]:
# Refit a random forest with the best hyperparameters from the search.
# NOTE(review): GridSearchCV (refit=True by default) already refits the best
# model; grid_search.best_estimator_ could be used directly. The explicit
# refit is kept, with the keyword arguments expanded via ** instead of
# being copied one by one.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split, used by the metrics cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [79]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence of normalized
# inputs — not a regression metric; zeros in y_pred can make it infinite.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.1263773231369439
R2 Score: 0.9899505836160186
RMSE: 0.355496
Entropy Value: 0.0032779127594959943
In [80]:
# Rank the model inputs by the fitted forest's impurity-based importance.
# (Reminder: the names refer to principal components, not raw features.)
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[80]:
feature importance
1 human_development_index 0.958053
2 extreme_poverty 0.017445
5 population 0.014344
3 gdp_per_capita 0.005157
0 hospital_beds_per_thousand 0.004252
4 population_density 0.000749
In [81]:
# Re-import the full cleaned dataframe for the next country pairing
# (the previous cells overwrote df_updated with a filtered subset).
# NOTE(review): hardcoded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[81]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [82]:
# Country pair compared in this Random Forest run (population health index).
country1 = 'Ireland'
country2 = 'Italy'

# Keep only the rows for the two countries of interest, restricted to the
# identifier columns plus the population-health-index features and the target.
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
In [83]:
# Preview the filtered frame (rows for the two selected countries only).
df_updated
Out[83]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 23.0 25.7 82.30 13.928 38.7 0.000000
18839 Ireland 3/1/2020 126.459 23.0 25.7 82.30 13.928 38.7 0.000000
18840 Ireland 3/2/2020 126.459 23.0 25.7 82.30 13.928 38.7 0.000000
18841 Ireland 3/3/2020 126.459 23.0 25.7 82.30 13.928 38.7 0.000000
18842 Ireland 3/4/2020 126.459 23.0 25.7 82.30 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 19.8 27.8 83.51 23.021 47.9 0.735109

2099 rows × 9 columns

In [84]:
# Convert the per-country mortality time series into supervised-learning
# features: a Random Forest treats rows as independent samples, so past
# mortality must be made explicit as lag columns rather than implied by
# row order (this replaces the original explanatory string literal).
#
# NOTE(review): df_updated is a filtered slice of the loaded frame; copy
# first so the column assignments below cannot raise SettingWithCopyWarning.
df_updated = df_updated.copy()

# Group once and reuse for all three shifts (1 day, 7 days, ~30 days;
# assumes one row per country per day — TODO confirm there are no date gaps).
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [85]:
# The first 1/7/30 rows per country have no lag history; treat the missing
# lag values as a pre-outbreak mortality rate of 0.
df_updated = df_updated.fillna({
    'prev_day_mortality': 0,
    'prev_week_mortality': 0,
    'prev_month_mortality': 0,
})
In [86]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] keeps 'Mortality Rate' and its lag columns in
# the PCA input — target leakage that likely inflates the reported R^2.
# PCA is also fit on unscaled data — confirm intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[86]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [87]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): these are the first 6 PCA components — linear combinations
# of ALL columns passed to fit (including the leaked target/lag columns).
n_components = 6  # equals the number of model input variables used below
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [88]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — each column is a principal component,
# not the raw feature it is named after; kept only because downstream cells
# reference these names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [89]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used below (X comes from
# principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [90]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# NOTE(review): X (from principal_df) and y (from df_updated) are aligned
# purely by row position — valid only while neither frame is reordered.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffling of time-series rows causes look-ahead
# leakage; a chronological split would be more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [91]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[91]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [92]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [93]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [94]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [95]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9980680064496482
In [96]:
# Refit a fresh Random Forest using the hyper-parameter combination selected by
# the grid search above, then predict on the held-out (scaled) test split.
# best_params_ holds exactly the four keys defined in param_grid.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [97]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its two arguments into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric and assumes non-negative inputs; confirm this value is interpreted
# (and reported) accordingly.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.019286091351583046
R2 Score: 0.9984156039191903
RMSE: 0.138874
Entropy Value: 0.0005198299298636055
In [98]:
# Rank the six model inputs by the forest's impurity-based importance scores.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[98]:
feature importance
0 cardiovasc_death_rate 0.837218
1 female_smokers 0.131195
2 male_smokers 0.026871
3 life_expectancy 0.002415
5 median_age 0.001822
4 aged_65_older 0.000480
In [99]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable DATA_DIR /
# pathlib.Path would make the notebook portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[99]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [100]:
country1 = 'Ireland'
country2 = 'Italy'

# Restrict the frame to the two countries being compared and to the columns
# used by the country-health-index Random Forest analysis.
selected_features = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_rows = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_rows, selected_features]
In [101]:
df_updated
Out[101]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2099 rows × 9 columns

In [102]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1-, 7-, and 30-day) so the
# time series can be framed as a supervised-learning problem.
for n_days, lag_name in [(1, 'prev_day_mortality'),
                         (7, 'prev_week_mortality'),
                         (30, 'prev_month_mortality')]:
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(n_days)
In [103]:
# The earliest rows of each country have no history to lag from; replace the
# resulting NaNs in the three lag columns with 0 in one vectorized step.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [104]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] appears to include 'Mortality Rate' and its
# lagged copies, so the target leaks into the components — confirm this is intended.
# NOTE(review): PCA is fitted on unscaled data and on all rows (before the
# train/test split), so test rows influence the learned components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[104]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [105]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [106]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixtures of all
# input columns), not the original variables — reusing the raw feature names here
# is misleading; names like 'PC1'..'PC6' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [107]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below (X is built
# from principal_df), so this step does not affect the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [108]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # principal components (see naming note above)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on time-series rows lets the model train
# on observations later than some test observations; a chronological split (or
# sklearn's TimeSeriesSplit) would better reflect forecasting performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [109]:
# Fit scaling on the training set only, so test statistics do not leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
Out[109]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [110]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [111]:
# Apply scaling on the test set (using the training-set mean/std fitted above)
X_test_scaled = scaler.transform(X_test)
In [112]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): this n_estimators=100 is only a placeholder — GridSearchCV below
# tries the values listed in param_grid instead.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidates, each cross-validated 10 times)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [113]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): consider n_jobs=-1 to parallelise the 81-candidate search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986413470709365
In [114]:
# Refit a fresh Random Forest using the hyper-parameter combination selected by
# the grid search above, then predict on the held-out (scaled) test split.
# best_params_ holds exactly the four keys defined in param_grid.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [115]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its two arguments into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric and assumes non-negative inputs; confirm this value is interpreted
# (and reported) accordingly.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013803653998387413
R2 Score: 0.9988659985635657
RMSE: 0.117489
Entropy Value: 0.0004399732479586434
In [116]:
# Rank the six model inputs by the forest's impurity-based importance scores.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[116]:
feature importance
1 human_development_index 0.937618
2 extreme_poverty 0.027005
5 population 0.023078
0 hospital_beds_per_thousand 0.008608
3 gdp_per_capita 0.002772
4 population_density 0.000918
In [117]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable DATA_DIR /
# pathlib.Path would make the notebook portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[117]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [118]:
country1 = 'Latvia'
country2 = 'Luxembourg'

# Restrict the frame to the two countries being compared and to the columns
# used by the population-health-index Random Forest analysis.
selected_features = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_rows = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_rows, selected_features]
In [119]:
df_updated
Out[119]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 20.9 26.0 82.25 14.312 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 20.9 26.0 82.25 14.312 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 20.9 26.0 82.25 14.312 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 20.9 26.0 82.25 14.312 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 20.9 26.0 82.25 14.312 39.7 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 25.6 51.0 75.29 19.754 43.9 0.631969

2079 rows × 9 columns

In [120]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1-, 7-, and 30-day) so the
# time series can be framed as a supervised-learning problem.
for n_days, lag_name in [(1, 'prev_day_mortality'),
                         (7, 'prev_week_mortality'),
                         (30, 'prev_month_mortality')]:
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(n_days)
In [121]:
# The earliest rows of each country have no history to lag from; replace the
# resulting NaNs in the three lag columns with 0 in one vectorized step.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [122]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] appears to include 'Mortality Rate' and its
# lagged copies, so the target leaks into the components — confirm this is intended.
# NOTE(review): PCA is fitted on unscaled data and on all rows (before the
# train/test split), so test rows influence the learned components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[122]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [123]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [124]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixtures of all
# input columns), not the original variables — reusing the raw feature names here
# is misleading; names like 'PC1'..'PC6' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [125]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below (X is built
# from principal_df), so this step does not affect the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [126]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # principal components (see naming note above)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on time-series rows lets the model train
# on observations later than some test observations; a chronological split (or
# sklearn's TimeSeriesSplit) would better reflect forecasting performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [127]:
# Fit scaling on the training set only, so test statistics do not leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
Out[127]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [128]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [129]:
# Apply scaling on the test set (using the training-set mean/std fitted above)
X_test_scaled = scaler.transform(X_test)
In [130]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): this n_estimators=100 is only a placeholder — GridSearchCV below
# tries the values listed in param_grid instead.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidates, each cross-validated 10 times)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [131]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): consider n_jobs=-1 to parallelise the 81-candidate search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9972959686205798
In [132]:
# Refit a fresh Random Forest using the hyper-parameter combination selected by
# the grid search above, then predict on the held-out (scaled) test split.
# best_params_ holds exactly the four keys defined in param_grid.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [133]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its two arguments into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric and assumes non-negative inputs; confirm this value is interpreted
# (and reported) accordingly.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0032690908969740737
R2 Score: 0.9917683753526738
RMSE: 0.057176
Entropy Value: 0.0011809055355708035
In [134]:
# Rank the six model inputs by the forest's impurity-based importance scores.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[134]:
feature importance
1 female_smokers 0.916922
2 male_smokers 0.032989
0 cardiovasc_death_rate 0.029222
5 median_age 0.016208
3 life_expectancy 0.004059
4 aged_65_older 0.000599
In [135]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable DATA_DIR /
# pathlib.Path would make the notebook portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[135]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [136]:
country1 = 'Latvia'
country2 = 'Luxembourg'

# Restrict the frame to the two countries being compared and to the columns
# used by the country-health-index Random Forest analysis.
selected_features = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_rows = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_rows, selected_features]
In [137]:
df_updated
Out[137]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631969

2079 rows × 9 columns

In [138]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1-, 7-, and 30-day) so the
# time series can be framed as a supervised-learning problem.
for n_days, lag_name in [(1, 'prev_day_mortality'),
                         (7, 'prev_week_mortality'),
                         (30, 'prev_month_mortality')]:
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(n_days)
In [139]:
# The earliest rows of each country have no history to lag from; replace the
# resulting NaNs in the three lag columns with 0 in one vectorized step.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [140]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] appears to include 'Mortality Rate' and its
# lagged copies, so the target leaks into the components — confirm this is intended.
# NOTE(review): PCA is fitted on unscaled data and on all rows (before the
# train/test split), so test rows influence the learned components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[140]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [141]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [142]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixtures of all
# input columns), not the original variables — reusing the raw feature names here
# is misleading; names like 'PC1'..'PC6' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [143]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below (X is built
# from principal_df), so this step does not affect the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [144]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # principal components (see naming note above)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on time-series rows lets the model train
# on observations later than some test observations; a chronological split (or
# sklearn's TimeSeriesSplit) would better reflect forecasting performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [145]:
# Fit scaling on the training set only, so test statistics do not leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
Out[145]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [146]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [147]:
# Apply scaling on the test set (using the training-set mean/std fitted above)
X_test_scaled = scaler.transform(X_test)
In [148]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): this n_estimators=100 is only a placeholder — GridSearchCV below
# tries the values listed in param_grid instead.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidates, each cross-validated 10 times)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [149]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): consider n_jobs=-1 to parallelise the 81-candidate search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.997384664409495
In [150]:
# Refit a fresh Random Forest using the hyper-parameter combination selected by
# the grid search above, then predict on the held-out (scaled) test split.
# best_params_ holds exactly the four keys defined in param_grid.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [151]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its two arguments into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric and assumes non-negative inputs; confirm this value is interpreted
# (and reported) accordingly.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0031257920224898803
R2 Score: 0.9921292042755496
RMSE: 0.055909
Entropy Value: 0.0010800758453721978
In [152]:
# Rank the six model inputs by the forest's impurity-based importance scores.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[152]:
feature importance
1 human_development_index 0.919383
5 population 0.042867
2 extreme_poverty 0.034274
3 gdp_per_capita 0.002756
4 population_density 0.000677
0 hospital_beds_per_thousand 0.000044
In [153]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable DATA_DIR /
# pathlib.Path would make the notebook portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[153]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [154]:
# Pair of countries compared in this run of the pipeline
country1 = 'Netherlands'
country2 = 'Sweden'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [155]:
df_updated
Out[155]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 18.8 18.9 82.80 19.985 41.0 0.816005

2100 rows × 9 columns

In [156]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): the per-country shift assumes rows are date-ordered within
# each location; the displayed frame looks sorted, but confirm upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
# shift(30) approximates "one month" as 30 daily rows
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [157]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): this imputes a mortality rate of 0 for each country's first
# 1/7/30 days (warm-up rows), fabricating values rather than dropping rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [158]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] at this point holds the 6 predictors PLUS
# 'Mortality Rate' and its three lag columns, so the prediction target leaks
# into the PCA features. PCA is also fit on unscaled data (large-magnitude
# columns dominate the variance) and on the full dataset before the
# train/test split -- all three likely inflate the downstream test scores.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[158]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [159]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Keep the first 6 of the 10 components (PCA inputs included the target and
# its lags -- see the fit cell above)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [160]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of all
# 10 PCA inputs), not the original variables -- relabeling them with the
# original names makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [161]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used -- X below comes from
# principal_df and y only needs 'Mortality Rate'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [162]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series places near-duplicate
# adjacent days in both train and test, which inflates the test R^2.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [163]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[163]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [164]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [165]:
# Apply scaling on the test set
# Reuses the train-fitted scaler -- no test-set leakage in this step
X_test_scaled = scaler.transform(X_test)
In [166]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)  # n_estimators here is overridden by the grid below

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 below this means 810 forest fits
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [167]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; refit=True (default) also retrains
# the best configuration on the full training set (grid_search.best_estimator_).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984774091696321
In [168]:
# Fit a Random Forest with the best hyperparameters found by the grid search.
# best_params_ holds exactly the keys defined in param_grid, so it can be
# unpacked directly instead of copying each entry by hand -- same estimator,
# less copy-paste error surface (equivalently: grid_search.best_estimator_).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions consumed by the evaluation cell below
y_pred = best_rf_model.predict(X_test_scaled)
In [169]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target (mortality rate)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both inputs into probability
# distributions and returns the KL divergence D(y_test || y_pred); it is not a
# conventional regression metric, and a zero in y_pred paired with a non-zero
# y_test entry yields inf -- confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010199687742194393
R2 Score: 0.9990245151847958
RMSE: 0.100994
Entropy Value: 0.00037837583326060144
In [170]:
# NOTE(review): X was built from PCA components relabeled with the original
# column names (see the principal_df cell), so these importances describe
# principal components, not the raw variables. Interpret with caution.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank from most to least influential in the fitted forest
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[170]:
feature importance
1 female_smokers 0.973528
2 male_smokers 0.023249
3 life_expectancy 0.001411
0 cardiovasc_death_rate 0.000862
5 median_age 0.000565
4 aged_65_older 0.000385
In [171]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- consider a configurable
# DATA_DIR (pathlib.Path) so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[171]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [172]:
# Pair of countries compared in this run of the pipeline
country1 = 'Netherlands'
country2 = 'Sweden'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [173]:
df_updated
Out[173]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.816005

2100 rows × 9 columns

In [174]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): the per-country shift assumes rows are date-ordered within
# each location; the displayed frame looks sorted, but confirm upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
# shift(30) approximates "one month" as 30 daily rows
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [175]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): this imputes a mortality rate of 0 for each country's first
# 1/7/30 days (warm-up rows), fabricating values rather than dropping rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [176]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] at this point holds the 6 predictors PLUS
# 'Mortality Rate' and its three lag columns, so the prediction target leaks
# into the PCA features. PCA is also fit on unscaled data (the 'population'
# column dominates the variance) and on the full dataset before the
# train/test split -- all three likely inflate the downstream test scores.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[176]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [177]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Keep the first 6 of the 10 components (PCA inputs included the target and
# its lags -- see the fit cell above)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [178]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of all
# 10 PCA inputs), not the original variables -- relabeling them with the
# original names makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [179]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used -- X below comes from
# principal_df and y only needs 'Mortality Rate'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [180]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series places near-duplicate
# adjacent days in both train and test, which inflates the test R^2.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [181]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[181]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [182]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [183]:
# Apply scaling on the test set
# Reuses the train-fitted scaler -- no test-set leakage in this step
X_test_scaled = scaler.transform(X_test)
In [184]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)  # n_estimators here is overridden by the grid below

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 below this means 810 forest fits
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [185]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; refit=True (default) also retrains
# the best configuration on the full training set (grid_search.best_estimator_).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984750138014478
In [186]:
# Fit a Random Forest with the best hyperparameters found by the grid search.
# best_params_ holds exactly the keys defined in param_grid, so it can be
# unpacked directly instead of copying each entry by hand -- same estimator,
# less copy-paste error surface (equivalently: grid_search.best_estimator_).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions consumed by the evaluation cell below
y_pred = best_rf_model.predict(X_test_scaled)
In [187]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target (mortality rate)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both inputs into probability
# distributions and returns the KL divergence D(y_test || y_pred); it is not a
# conventional regression metric, and a zero in y_pred paired with a non-zero
# y_test entry yields inf -- confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010845809623248557
R2 Score: 0.9989627209319059
RMSE: 0.104143
Entropy Value: 0.0004003501440794584
In [188]:
# NOTE(review): X was built from PCA components relabeled with the original
# column names (see the principal_df cell), so these importances describe
# principal components, not the raw variables. Interpret with caution.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank from most to least influential in the fitted forest
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[188]:
feature importance
1 human_development_index 0.973698
2 extreme_poverty 0.023524
3 gdp_per_capita 0.001460
5 population 0.000883
4 population_density 0.000409
0 hospital_beds_per_thousand 0.000027
In [189]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- consider a configurable
# DATA_DIR (pathlib.Path) so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[189]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [12]:
# Pair of countries compared in this run of the pipeline
country1 = 'United Kingdom'
country2 = 'Austria'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [13]:
df_updated
Out[13]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 122.137 20.0 24.7 81.32 18.517 40.8 0.883564
13606 United Kingdom 12/26/2022 122.137 20.0 24.7 81.32 18.517 40.8 0.883564
13607 United Kingdom 12/27/2022 122.137 20.0 24.7 81.32 18.517 40.8 0.883564
13608 United Kingdom 12/28/2022 122.137 20.0 24.7 81.32 18.517 40.8 0.883564
13609 United Kingdom 12/29/2022 122.137 20.0 24.7 81.32 18.517 40.8 0.883564

2102 rows × 9 columns

In [14]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): the per-country shift assumes rows are date-ordered within
# each location; the displayed frame looks sorted, but confirm upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
# shift(30) approximates "one month" as 30 daily rows
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [15]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): this imputes a mortality rate of 0 for each country's first
# 1/7/30 days (warm-up rows), fabricating values rather than dropping rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [16]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] at this point holds the 6 predictors PLUS
# 'Mortality Rate' and its three lag columns, so the prediction target leaks
# into the PCA features. PCA is also fit on unscaled data (large-magnitude
# columns dominate the variance) and on the full dataset before the
# train/test split -- all three likely inflate the downstream test scores.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[16]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [17]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Keep the first 6 of the 10 components (PCA inputs included the target and
# its lags -- see the fit cell above)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [18]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of all
# 10 PCA inputs), not the original variables -- relabeling them with the
# original names makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [19]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used -- X below comes from
# principal_df and y only needs 'Mortality Rate'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [20]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series places near-duplicate
# adjacent days in both train and test, which inflates the test R^2.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [21]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[21]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [22]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [23]:
# Apply scaling on the test set
# Reuses the train-fitted scaler -- no test-set leakage in this step
X_test_scaled = scaler.transform(X_test)
In [24]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)  # n_estimators here is overridden by the grid below

# define parameter grid
# 3*3*3*3 = 81 combinations; with cv=10 below this means 810 forest fits
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [25]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; refit=True (default) also retrains
# the best configuration on the full training set (grid_search.best_estimator_).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.9762513886260628
In [26]:
# Fit a Random Forest with the best hyperparameters found by the grid search.
# best_params_ holds exactly the keys defined in param_grid, so it can be
# unpacked directly instead of copying each entry by hand -- same estimator,
# less copy-paste error surface (equivalently: grid_search.best_estimator_).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions consumed by the evaluation cell below
y_pred = best_rf_model.predict(X_test_scaled)
In [27]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target (mortality rate)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both inputs into probability
# distributions and returns the KL divergence D(y_test || y_pred); it is not a
# conventional regression metric, and a zero in y_pred paired with a non-zero
# y_test entry yields inf -- confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  8.726735777741874
R2 Score: 0.7733967613328221
RMSE: 2.954105
Entropy Value: 0.027561275886581353
In [28]:
# NOTE(review): X was built from PCA components relabeled with the original
# column names (see the principal_df cell), so these importances describe
# principal components, not the raw variables. Interpret with caution.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank from most to least influential in the fitted forest
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[28]:
feature importance
5 median_age 0.949010
1 female_smokers 0.028432
2 male_smokers 0.008241
0 cardiovasc_death_rate 0.005321
3 life_expectancy 0.004751
4 aged_65_older 0.004245
In [29]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- consider a configurable
# DATA_DIR (pathlib.Path) so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[29]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [30]:
# Pair of countries compared in this run of the pipeline
country1 = 'United Kingdom'
country2 = 'Austria'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [31]:
df_updated
Out[31]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 2.54 0.932 0.2 39753.244 272.898 67508936 0.883564
13606 United Kingdom 12/26/2022 2.54 0.932 0.2 39753.244 272.898 67508936 0.883564
13607 United Kingdom 12/27/2022 2.54 0.932 0.2 39753.244 272.898 67508936 0.883564
13608 United Kingdom 12/28/2022 2.54 0.932 0.2 39753.244 272.898 67508936 0.883564
13609 United Kingdom 12/29/2022 2.54 0.932 0.2 39753.244 272.898 67508936 0.883564

2102 rows × 9 columns

In [32]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [33]:
# The earliest rows of each country's series have no prior observation to lag
# from; fill those missing lagged values with 0 instead of dropping the rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [34]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] appears to still contain the
# 'Mortality Rate' target and its lagged copies, so the target leaks into the
# PCA basis; PCA is also fit on all rows before the train/test split below —
# prefer fitting on training-set predictors only. Confirm the intended columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[34]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [35]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): principal components are variance-ordered linear mixtures of
# ALL columns passed to PCA — keeping the first 6 components is not the same
# as keeping the 6 original input variables.
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [36]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original features — labelling them with original feature names makes the
# later feature-importance table easy to misread. Prefer 'PC1'..'PC6' labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [37]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the model input X below is built from principal_df, so these
# dummy columns appear unused by the Random Forest — verify they are needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [38]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']

# Design matrix from the retained principal components; target is the mortality rate
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [39]:
# Fit scaling on the training set
# Learn the mean/std standardisation parameters from the training split only,
# so no information from the test split leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[39]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [40]:
# Apply the training-set scaling parameters to the training data
X_train_scaled = scaler.transform(X_train)
In [41]:
# Apply scaling on the test set (re-using the parameters fitted on the training set)
X_test_scaled = scaler.transform(X_test)
In [42]:
# Base Random Forest regressor; the grid search below tunes its hyperparameters
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Candidate hyperparameter values for the exhaustive grid search
# (insertion order kept identical to the original literal)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [43]:
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean CV score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9836112086552532
In [44]:
# Refit a Random Forest using the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids re-listing each key by hand (error-prone and
# duplicative) and stays correct if the grid gains new hyperparameters.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [45]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays to sum to 1
# and returns their KL divergence — mortality rates are not probability
# distributions, so this value has no clear interpretation here (and a zero in
# y_pred where y_test is non-zero yields inf). Consider dropping this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  6.845097630533553
R2 Score: 0.8222564161930779
RMSE: 2.616314
Entropy Value: 0.018393087069933763
In [46]:
# Rank the model inputs by impurity-based importance (note: the inputs here are
# PCA components that were given the original feature names upstream).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[46]:
feature importance
1 human_development_index 0.898081
2 extreme_poverty 0.045096
5 population 0.034537
4 population_density 0.014076
3 gdp_per_capita 0.007620
0 hospital_beds_per_thousand 0.000590
In [225]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — parameterise (e.g. a
# DATA_DIR constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[225]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [226]:
country1 = 'Bulgaria'
country2 = 'Czechia'

# Restrict to the population-health predictors plus target, for this country pair
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [227]:
# Inspect the filtered two-country frame
df_updated
Out[227]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 30.5 38.3 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 30.5 38.3 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 30.5 38.3 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 30.5 38.3 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 30.5 38.3 79.38 19.027 43.3 0.919575

2061 rows × 9 columns

In [228]:
# Convert the time series into a supervised-learning table: a Random Forest has
# no notion of row order, so prior mortality values must be supplied explicitly
# as lagged feature columns (previous day / week / month), computed per country
# so one country's history never leaks into another's rows.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [229]:
# The earliest rows of each country's series have no prior observation to lag
# from; fill those missing lagged values with 0 instead of dropping the rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [230]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target and
# its lagged copies (see the frame shown above), so the target leaks into the
# PCA basis; PCA is also fit on all rows before the train/test split below —
# prefer fitting on training-set predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[230]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [231]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): principal components are variance-ordered linear mixtures of
# ALL columns passed to PCA — keeping the first 6 components is not the same
# as keeping the 6 original input variables.
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [232]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original features — labelling them with original feature names makes the
# later feature-importance table easy to misread. Prefer 'PC1'..'PC6' labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [233]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the model input X below is built from principal_df, so these
# dummy columns appear unused by the Random Forest — verify they are needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [234]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']

# Design matrix from the retained principal components; target is the mortality rate
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [235]:
# Fit scaling on the training set
# Learn the mean/std standardisation parameters from the training split only,
# so no information from the test split leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[235]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [236]:
# Apply the training-set scaling parameters to the training data
X_train_scaled = scaler.transform(X_train)
In [237]:
# Apply scaling on the test set (re-using the parameters fitted on the training set)
X_test_scaled = scaler.transform(X_test)
In [238]:
# Base Random Forest regressor; the grid search below tunes its hyperparameters
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Candidate hyperparameter values for the exhaustive grid search
# (insertion order kept identical to the original literal)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [239]:
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean CV score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9464328946652518
In [240]:
# Refit a Random Forest using the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids re-listing each key by hand (error-prone and
# duplicative) and stays correct if the grid gains new hyperparameters.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [241]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays to sum to 1
# and returns their KL divergence — mortality rates are not probability
# distributions, so this value has no clear interpretation here (and a zero in
# y_pred where y_test is non-zero yields inf). Consider dropping this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002615323739285408
R2 Score: 0.9984048531108621
RMSE: 0.051140
Entropy Value: 0.0003186984164555006
In [242]:
# Rank the model inputs by impurity-based importance (note: the inputs here are
# PCA components that were given the original feature names upstream).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[242]:
feature importance
0 cardiovasc_death_rate 0.829005
5 median_age 0.070605
2 male_smokers 0.032670
3 life_expectancy 0.031136
1 female_smokers 0.024700
4 aged_65_older 0.011884
In [243]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — parameterise (e.g. a
# DATA_DIR constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[243]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [244]:
country1 = 'Bulgaria'
country2 = 'Czechia'

# Restrict to the country-health predictors plus target, for this country pair
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [245]:
# Inspect the filtered two-country frame
df_updated
Out[245]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919575

2061 rows × 9 columns

In [246]:
# Convert the time series into a supervised-learning table: a Random Forest has
# no notion of row order, so prior mortality values must be supplied explicitly
# as lagged feature columns (previous day / week / month), computed per country
# so one country's history never leaks into another's rows.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [247]:
# The earliest rows of each country's series have no prior observation to lag
# from; fill those missing lagged values with 0 instead of dropping the rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [248]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target and
# its lagged copies (see the frame shown above), so the target leaks into the
# PCA basis; PCA is also fit on all rows before the train/test split below —
# prefer fitting on training-set predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[248]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [249]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): principal components are variance-ordered linear mixtures of
# ALL columns passed to PCA — keeping the first 6 components is not the same
# as keeping the 6 original input variables.
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [250]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original features — labelling them with original feature names makes the
# later feature-importance table easy to misread. Prefer 'PC1'..'PC6' labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [251]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the model input X below is built from principal_df, so these
# dummy columns appear unused by the Random Forest — verify they are needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [252]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']

# Design matrix from the retained principal components; target is the mortality rate
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [253]:
# Fit scaling on the training set
# Learn the mean/std standardisation parameters from the training split only,
# so no information from the test split leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[253]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [254]:
# Apply the training-set scaling parameters to the training data
X_train_scaled = scaler.transform(X_train)
In [255]:
# Apply scaling on the test set (re-using the parameters fitted on the training set)
X_test_scaled = scaler.transform(X_test)
In [256]:
# Base Random Forest regressor; the grid search below tunes its hyperparameters
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Candidate hyperparameter values for the exhaustive grid search
# (insertion order kept identical to the original literal)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [257]:
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean CV score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9509416306303644
In [258]:
# Refit a Random Forest using the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids re-listing each key by hand (error-prone and
# duplicative) and stays correct if the grid gains new hyperparameters.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [259]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays to sum to 1
# and returns their KL divergence — mortality rates are not probability
# distributions, so this value has no clear interpretation here (and a zero in
# y_pred where y_test is non-zero yields inf). Consider dropping this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003709674750284998
R2 Score: 0.9977373829294084
RMSE: 0.060907
Entropy Value: 0.0006342010482872406
In [260]:
# Rank the model inputs by impurity-based importance (note: the inputs here are
# PCA components that were given the original feature names upstream).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[260]:
feature importance
5 population 0.598261
0 hospital_beds_per_thousand 0.292702
2 extreme_poverty 0.046316
3 gdp_per_capita 0.031321
1 human_development_index 0.026994
4 population_density 0.004406
In [261]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — parameterise (e.g. a
# DATA_DIR constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[261]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [262]:
country1 = 'Denmark'
country2 = 'Finland'

# Restrict to the population-health predictors plus target, for this country pair
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [263]:
# Inspect the filtered two-country frame
df_updated
Out[263]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 19.3 18.8 80.90 19.677 42.3 0.00000
5188 Denmark 2/3/2020 114.767 19.3 18.8 80.90 19.677 42.3 0.00000
5189 Denmark 2/4/2020 114.767 19.3 18.8 80.90 19.677 42.3 0.00000
5190 Denmark 2/5/2020 114.767 19.3 18.8 80.90 19.677 42.3 0.00000
5191 Denmark 2/6/2020 114.767 19.3 18.8 80.90 19.677 42.3 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 18.3 22.6 81.91 21.228 42.8 0.55159
8372 Finland 12/26/2022 153.507 18.3 22.6 81.91 21.228 42.8 0.55159
8373 Finland 12/27/2022 153.507 18.3 22.6 81.91 21.228 42.8 0.55159
8374 Finland 12/28/2022 153.507 18.3 22.6 81.91 21.228 42.8 0.55159
8375 Finland 12/29/2022 153.507 18.3 22.6 81.91 21.228 42.8 0.55159

2128 rows × 9 columns

In [264]:
# Convert the time series into a supervised-learning table: a Random Forest has
# no notion of row order, so prior mortality values must be supplied explicitly
# as lagged feature columns (previous day / week / month), computed per country
# so one country's history never leaks into another's rows.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [265]:
# The earliest rows of each country's series have no prior observation to lag
# from; fill those missing lagged values with 0 instead of dropping the rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [266]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target and
# its lagged copies (see the frame shown above), so the target leaks into the
# PCA basis; PCA is also fit on all rows before the train/test split below —
# prefer fitting on training-set predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[266]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [267]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): principal components are variance-ordered linear mixtures of
# ALL columns passed to PCA — keeping the first 6 components is not the same
# as keeping the 6 original input variables.
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [268]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the
# original features — labelling them with original feature names makes the
# later feature-importance table easy to misread. Prefer 'PC1'..'PC6' labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [269]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the model input X below is built from principal_df, so these
# dummy columns appear unused by the Random Forest — verify they are needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [270]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']

# Design matrix from the retained principal components; target is the mortality rate
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [271]:
# Fit scaling on the training set
# Learn the mean/std standardisation parameters from the training split only,
# so no information from the test split leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[271]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [272]:
# Apply the training-set scaling parameters to the training data
X_train_scaled = scaler.transform(X_train)
In [273]:
# Apply scaling on the test set (re-using the parameters fitted on the training set)
X_test_scaled = scaler.transform(X_test)
In [274]:
# Base Random Forest regressor; the grid search below tunes its hyperparameters
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Candidate hyperparameter values for the exhaustive grid search
# (insertion order kept identical to the original literal)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [275]:
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean CV score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989025431448132
In [276]:
# Refit a Random Forest using the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids re-listing each key by hand (error-prone and
# duplicative) and stays correct if the grid gains new hyperparameters.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [277]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays to sum to 1
# and returns their KL divergence — mortality rates are not probability
# distributions, so this value has no clear interpretation here (and a zero in
# y_pred where y_test is non-zero yields inf). Consider dropping this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008557398987995634
R2 Score: 0.9951455562253472
RMSE: 0.092506
Entropy Value: 0.001751932122670549
In [278]:
# NOTE(review): X was built from PCA component scores, so these importances
# belong to principal components, not to the original columns; labelling them
# with `selected_cols` misattributes importance to raw features.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[278]:
feature importance
1 female_smokers 0.955989
2 male_smokers 0.028413
3 life_expectancy 0.006226
5 median_age 0.005439
0 cardiovasc_death_rate 0.003622
4 aged_65_older 0.000312
In [279]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a
# configurable DATA_DIR / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[279]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [280]:
country1 = 'Denmark'
country2 = 'Finland'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes this an independent frame, so the lagged-column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning on a slice view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [281]:
# Quick look at the filtered two-country frame
df_updated
Out[281]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5188 Denmark 2/3/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5189 Denmark 2/4/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5190 Denmark 2/5/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5191 Denmark 2/6/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159

2128 rows × 9 columns

In [282]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by one day, one week, and one month.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_name, lag_days in [('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)]:
    df_updated[lag_name] = mortality_by_country.shift(lag_days)
In [283]:
# The first day/week/month of each country's series has no lag history;
# treat those missing lagged mortality values as 0.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [284]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lagged copies, so the components are partly built from the target itself —
# target leakage into the features, which likely inflates downstream scores.
# PCA is also fit on unscaled data, so high-variance columns (e.g. population)
# dominate the components. Both points should be fixed before trusting results.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[284]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [285]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Score every row on the fitted components and keep only the first n_components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [286]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores but are labelled with
# the original feature names — each column is a principal component, not the
# raw feature, so later per-"feature" interpretation is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [287]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns are never used later —
# X is built from principal_df and y from 'Mortality Rate' — so this step
# appears redundant; confirm before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [288]:
# X holds the six retained PCA component scores; y is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle split on daily time-series rows puts near-
# duplicate adjacent days in both train and test, which inflates test scores;
# a chronological split would be a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [289]:
# Fit scaling on the training set
# (statistics come from X_train only, so the test split never leaks into them)
scaler = StandardScaler()
scaler.fit(X_train)
Out[289]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [290]:
# Apply scaling on the training set
# (uses statistics fitted on X_train only, so no test-set information leaks in)
X_train_scaled = scaler.transform(X_train)
In [291]:
# Apply scaling on the test set
# (reuses the training-set statistics; the test split is never used for fitting)
X_test_scaled = scaler.transform(X_test)
In [292]:
# Base random-forest regressor; the fixed seed makes every grid-search fit repeatable.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space: 3 * 3 * 3 * 3 = 81 candidate configurations.
# (sklearn's ParameterGrid iterates keys in sorted order, so the literal's
# key order does not affect the search.)
param_grid = {
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'max_depth': [5, 10, 15],
    'n_estimators': [50, 100, 200],
}
In [293]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidates x 10 folds in parallel on all cores;
# results are unchanged because every forest is still seeded via random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989156833724155
In [294]:
# GridSearchCV already refits the winning configuration on the whole training
# set (refit=True is the default), so manually re-instantiating and refitting
# the model duplicates that work. best_estimator_ is the fitted forest with
# the same hyperparameters and random_state=42, so predictions are identical.
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test_scaled)
In [295]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into
# probability distributions and returns the KL divergence D(pk || qk); it is
# not a standard regression error metric, and zeros in y_test / y_pred can
# make it infinite or undefined. Treat this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008329520038950185
R2 Score: 0.995274827461516
RMSE: 0.091266
Entropy Value: 0.0016508456649447665
In [296]:
# NOTE(review): X was built from PCA component scores, so these importances
# belong to principal components, not to the original columns; labelling them
# with `selected_cols` misattributes importance to raw features.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[296]:
feature importance
1 human_development_index 0.956574
2 extreme_poverty 0.028531
5 population 0.008398
3 gdp_per_capita 0.006081
4 population_density 0.000371
0 hospital_beds_per_thousand 0.000044
In [297]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a
# configurable DATA_DIR / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[297]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [298]:
country1 = 'Switzerland'
country2 = 'Canada'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes this an independent frame, so the lagged-column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning on a slice view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [299]:
# Quick look at the filtered two-country frame
df_updated
Out[299]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 12.0 16.6 82.43 16.984 41.4 1.093162

2111 rows × 9 columns

In [300]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by one day, one week, and one month.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_name, lag_days in [('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)]:
    df_updated[lag_name] = mortality_by_country.shift(lag_days)
In [301]:
# The first day/week/month of each country's series has no lag history;
# treat those missing lagged mortality values as 0.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [302]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lagged copies, so the components are partly built from the target itself —
# target leakage into the features, which likely inflates downstream scores.
# PCA is also fit on unscaled data, so high-variance columns dominate the
# components. Both points should be fixed before trusting results.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[302]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [303]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Score every row on the fitted components and keep only the first n_components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [304]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores but are labelled with
# the original feature names — each column is a principal component, not the
# raw feature, so later per-"feature" interpretation is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [305]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns are never used later —
# X is built from principal_df and y from 'Mortality Rate' — so this step
# appears redundant; confirm before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [306]:
# X holds the six retained PCA component scores; y is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle split on daily time-series rows puts near-
# duplicate adjacent days in both train and test, which inflates test scores;
# a chronological split would be a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [307]:
# Fit scaling on the training set
# (statistics come from X_train only, so the test split never leaks into them)
scaler = StandardScaler()
scaler.fit(X_train)
Out[307]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [308]:
# Apply scaling on the training set
# (uses statistics fitted on X_train only, so no test-set information leaks in)
X_train_scaled = scaler.transform(X_train)
In [309]:
# Apply scaling on the test set
# (reuses the training-set statistics; the test split is never used for fitting)
X_test_scaled = scaler.transform(X_test)
In [310]:
# Base random-forest regressor; the fixed seed makes every grid-search fit repeatable.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space: 3 * 3 * 3 * 3 = 81 candidate configurations.
# (sklearn's ParameterGrid iterates keys in sorted order, so the literal's
# key order does not affect the search.)
param_grid = {
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'max_depth': [5, 10, 15],
    'n_estimators': [50, 100, 200],
}
In [311]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidates x 10 folds in parallel on all cores;
# results are unchanged because every forest is still seeded via random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.999255499783543
In [312]:
# GridSearchCV already refits the winning configuration on the whole training
# set (refit=True is the default), so manually re-instantiating and refitting
# the model duplicates that work. best_estimator_ is the fitted forest with
# the same hyperparameters and random_state=42, so predictions are identical.
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test_scaled)
In [313]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into
# probability distributions and returns the KL divergence D(pk || qk); it is
# not a standard regression error metric, and zeros in y_test / y_pred can
# make it infinite or undefined. Treat this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0057175049217638016
R2 Score: 0.9982582431748896
RMSE: 0.075614
Entropy Value: 0.00090492426720773
In [314]:
# NOTE(review): X was built from PCA component scores, so these importances
# belong to principal components, not to the original columns; labelling them
# with `selected_cols` misattributes importance to raw features.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[314]:
feature importance
1 female_smokers 0.853151
0 cardiovasc_death_rate 0.096198
5 median_age 0.024493
2 male_smokers 0.024408
3 life_expectancy 0.001513
4 aged_65_older 0.000235
In [315]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a
# configurable DATA_DIR / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[315]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [316]:
country1 = 'Switzerland'
country2 = 'Canada'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes this an independent frame, so the lagged-column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning on a slice view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [317]:
# Quick look at the filtered two-country frame
df_updated
Out[317]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.50 0.929 0.50 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.50 0.929 0.50 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.50 0.929 0.50 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.50 0.929 0.50 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.50 0.929 0.50 44017.591 4.037 38454328 1.093162

2111 rows × 9 columns

In [318]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by one day, one week, and one month.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_name, lag_days in [('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)]:
    df_updated[lag_name] = mortality_by_country.shift(lag_days)
In [319]:
# The first day/week/month of each country's series has no lag history;
# treat those missing lagged mortality values as 0.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [320]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lagged copies, so the components are partly built from the target itself —
# target leakage into the features, which likely inflates downstream scores.
# PCA is also fit on unscaled data, so high-variance columns (e.g. population)
# dominate the components. Both points should be fixed before trusting results.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[320]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [321]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Score every row on the fitted components and keep only the first n_components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [322]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores but are labelled with
# the original feature names — each column is a principal component, not the
# raw feature, so later per-"feature" interpretation is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [323]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns are never used later —
# X is built from principal_df and y from 'Mortality Rate' — so this step
# appears redundant; confirm before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [324]:
# X holds the six retained PCA component scores; y is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle split on daily time-series rows puts near-
# duplicate adjacent days in both train and test, which inflates test scores;
# a chronological split would be a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [325]:
# Fit scaling on the training set
# (statistics come from X_train only, so the test split never leaks into them)
scaler = StandardScaler()
scaler.fit(X_train)
Out[325]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [326]:
# Apply scaling on the training set
# (uses statistics fitted on X_train only, so no test-set information leaks in)
X_train_scaled = scaler.transform(X_train)
In [327]:
# Apply scaling on the test set
# (reuses the training-set statistics; the test split is never used for fitting)
X_test_scaled = scaler.transform(X_test)
In [328]:
# Base random-forest regressor; the fixed seed makes every grid-search fit repeatable.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space: 3 * 3 * 3 * 3 = 81 candidate configurations.
# (sklearn's ParameterGrid iterates keys in sorted order, so the literal's
# key order does not affect the search.)
param_grid = {
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'max_depth': [5, 10, 15],
    'n_estimators': [50, 100, 200],
}
In [329]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidates x 10 folds in parallel on all cores;
# results are unchanged because every forest is still seeded via random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9990269951754802
In [330]:
# GridSearchCV already refits the winning configuration on the whole training
# set (refit=True is the default), so manually re-instantiating and refitting
# the model duplicates that work. best_estimator_ is the fitted forest with
# the same hyperparameters and random_state=42, so predictions are identical.
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test_scaled)
In [331]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into
# probability distributions and returns the KL divergence D(pk || qk); it is
# not a standard regression error metric, and zeros in y_test / y_pred can
# make it infinite or undefined. Treat this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007219604675773637
R2 Score: 0.9978006497780595
RMSE: 0.084968
Entropy Value: 0.0013001589537891883
In [332]:
# NOTE(review): X was built from PCA component scores, so these importances
# belong to principal components, not to the original columns; labelling them
# with `selected_cols` misattributes importance to raw features.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[332]:
feature importance
1 human_development_index 0.940026
5 population 0.034436
2 extreme_poverty 0.023629
3 gdp_per_capita 0.001447
4 population_density 0.000316
0 hospital_beds_per_thousand 0.000146
In [333]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a
# configurable DATA_DIR / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[333]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [334]:
country1 = 'Cyprus'
country2 = 'Portugal'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes this an independent frame, so the lagged-column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning on a slice view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [335]:
# Quick look at the filtered two-country frame
df_updated
Out[335]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 127.842 16.3 30.0 82.05 21.502 46.2 0.462977
11514 Portugal 12/26/2022 127.842 16.3 30.0 82.05 21.502 46.2 0.462977
11515 Portugal 12/27/2022 127.842 16.3 30.0 82.05 21.502 46.2 0.462977
11516 Portugal 12/28/2022 127.842 16.3 30.0 82.05 21.502 46.2 0.462977
11517 Portugal 12/29/2022 127.842 16.3 30.0 82.05 21.502 46.2 0.462977

2061 rows × 9 columns

In [336]:
# Convert the time series into a supervised-learning table: a Random Forest is a
# non-sequential model, so temporal context is encoded explicitly as lagged
# copies of the target (previous day / week / month mortality rate). Lags are
# computed per country with groupby().shift() so a lag never crosses a country
# boundary, and this tabular form lets the forest rank predictors of mortality.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [337]:
# Rows inside a country's first day/week/month have no lagged observation;
# treat "no prior observation" as zero mortality for all three lag columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [338]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies, so the prediction target leaks into the components later used as
# model inputs — this likely inflates the downstream R^2; confirm and consider
# fitting PCA on the predictor columns only.
# NOTE(review): PCA here runs on unscaled data, so high-variance features
# (e.g. cardiovasc_death_rate) dominate the components — consider standardizing first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[338]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [339]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project the same columns onto the fitted PCA basis and keep the first six
# (highest-variance) components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [340]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — labelling PC1..PC6 with original feature names is misleading and
# makes the later "feature importances" read as per-feature scores; verify intent.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# .values keeps the positional row order, aligning locations with components.
principal_df['location'] = df_updated['location'].values
In [341]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never used below — X is built
# from principal_df and only 'Mortality Rate' is read from this frame afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [342]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the six principal components (labelled with original feature names);
# y: the raw mortality rate. Alignment between the two frames relies on both
# preserving the original row order, which .values does here.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on a time series lets the model train
# on rows from the "future" of test rows — a time-based split would be more
# defensible for this data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [343]:
# Fit scaling on the training set
# Mean/std are learned from the training split only, so no information from
# the test split leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[343]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [344]:
# Apply scaling on the training set
# (uses the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [345]:
# Apply scaling on the test set
# (same train-fitted parameters — never refit on the test split)
X_test_scaled = scaler.transform(X_test)
In [346]:
# Instantiate the RandomForestRegressor Model
# (n_estimators here is a placeholder; GridSearchCV below searches over it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 candidate combinations; with cv=10 below that is 810 model fits.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [347]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81 x 10 fits across all cores; results are
# unchanged because each fit is still seeded via the estimator's random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9954906642741526
In [348]:
# fit random forest model with best hyperparameters from above.
# Unpacking best_params_ keeps this cell in sync with the grid automatically
# instead of copying each key by hand (grid_search.best_estimator_ is an
# already-refit equivalent; rebuilding keeps the explicit random_state visible).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [349]:
# Evaluate the Random Forest on the held-out test set (MSE, RMSE, R^2, MAE).
# scipy.stats.entropy(p, q) computes the KL divergence between two probability
# distributions; mortality rates are not a distribution and zeros in y_test
# drive it to inf (as seen in earlier runs), so it is not a valid regression
# metric — mean absolute error is reported instead.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
mae = np.mean(np.abs(y_test - y_pred))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("MAE:", mae)
MSE:  0.000776118438549149
R2 Score: 0.9992061430700794
RMSE: 0.027859
Entropy Value: inf
In [350]:
# Rank the six model inputs by impurity-based importance.
# NOTE(review): these are importances of principal components that were merely
# labelled with original feature names in the PCA dataframe above — they do not
# measure the original variables' importance directly.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[350]:
feature importance
1 female_smokers 0.627077
0 cardiovasc_death_rate 0.278172
5 median_age 0.066527
2 male_smokers 0.024780
3 life_expectancy 0.002661
4 aged_65_older 0.000783
In [351]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare last expression shows the loaded frame via rich display.
df_updated
Out[351]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [352]:
country1 = 'Cyprus'
country2 = 'Portugal'

# Extracting important features for the Random Forest Model Analysis for the country health index.
# .copy() makes the two-country subset an explicit copy so the lagged-column
# assignments in the following cells do not raise pandas' SettingWithCopyWarning.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
df_updated = df_updated[feature_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [353]:
# Show the filtered two-country subset (bare last expression uses rich display)
df_updated
Out[353]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 3.39 0.864 0.50 27936.896 112.371 10270857 0.462977
11514 Portugal 12/26/2022 3.39 0.864 0.50 27936.896 112.371 10270857 0.462977
11515 Portugal 12/27/2022 3.39 0.864 0.50 27936.896 112.371 10270857 0.462977
11516 Portugal 12/28/2022 3.39 0.864 0.50 27936.896 112.371 10270857 0.462977
11517 Portugal 12/29/2022 3.39 0.864 0.50 27936.896 112.371 10270857 0.462977

2061 rows × 9 columns

In [354]:
# Convert the time series into a supervised-learning table: a Random Forest is a
# non-sequential model, so temporal context is encoded explicitly as lagged
# copies of the target (previous day / week / month mortality rate). Lags are
# computed per country with groupby().shift() so a lag never crosses a country
# boundary, and this tabular form lets the forest rank predictors of mortality.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [355]:
# Rows inside a country's first day/week/month have no lagged observation;
# treat "no prior observation" as zero mortality for all three lag columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [356]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies, so the prediction target leaks into the components later used as
# model inputs — this likely inflates the downstream R^2; confirm and consider
# fitting PCA on the predictor columns only.
# NOTE(review): PCA here runs on unscaled data, so high-variance features
# (e.g. gdp_per_capita, population) dominate the components — consider standardizing first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[356]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [357]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project the same columns onto the fitted PCA basis and keep the first six
# (highest-variance) components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [358]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — labelling PC1..PC6 with original feature names is misleading and
# makes the later "feature importances" read as per-feature scores; verify intent.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# .values keeps the positional row order, aligning locations with components.
principal_df['location'] = df_updated['location'].values
In [359]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never used below — X is built
# from principal_df and only 'Mortality Rate' is read from this frame afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [360]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the six principal components (labelled with original feature names);
# y: the raw mortality rate. Alignment between the two frames relies on both
# preserving the original row order, which .values does here.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on a time series lets the model train
# on rows from the "future" of test rows — a time-based split would be more
# defensible for this data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [361]:
# Fit scaling on the training set
# Mean/std are learned from the training split only, so no information from
# the test split leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[361]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [362]:
# Apply scaling on the training set
# (uses the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [363]:
# Apply scaling on the test set
# (same train-fitted parameters — never refit on the test split)
X_test_scaled = scaler.transform(X_test)
In [364]:
# Instantiate the RandomForestRegressor Model
# (n_estimators here is a placeholder; GridSearchCV below searches over it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 candidate combinations; with cv=10 below that is 810 model fits.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [365]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81 x 10 fits across all cores; results are
# unchanged because each fit is still seeded via the estimator's random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9959306509224998
In [366]:
# fit random forest model with best hyperparameters from above.
# Unpacking best_params_ keeps this cell in sync with the grid automatically
# instead of copying each key by hand (grid_search.best_estimator_ is an
# already-refit equivalent; rebuilding keeps the explicit random_state visible).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [367]:
# Evaluate the Random Forest on the held-out test set (MSE, RMSE, R^2, MAE).
# scipy.stats.entropy(p, q) computes the KL divergence between two probability
# distributions; mortality rates are not a distribution and zeros in y_test
# drive it to inf (as seen in earlier runs), so it is not a valid regression
# metric — mean absolute error is reported instead.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
mae = np.mean(np.abs(y_test - y_pred))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("MAE:", mae)
MSE:  0.0011340569012311748
R2 Score: 0.9988400237834194
RMSE: 0.033676
Entropy Value: inf
In [368]:
# Rank the six model inputs by impurity-based importance.
# NOTE(review): these are importances of principal components that were merely
# labelled with original feature names in the PCA dataframe above — they do not
# measure the original variables' importance directly.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[368]:
feature importance
1 human_development_index 0.774123
5 population 0.191199
2 extreme_poverty 0.031279
3 gdp_per_capita 0.002127
4 population_density 0.001223
0 hospital_beds_per_thousand 0.000049
In [369]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare last expression shows the loaded frame via rich display.
df_updated
Out[369]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [370]:
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for Random Forest Model Analysis for the population health index.
# .copy() makes the two-country subset an explicit copy so the lagged-column
# assignments in the following cells do not raise pandas' SettingWithCopyWarning.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers',
                'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[feature_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [371]:
# Show the filtered two-country subset (bare last expression uses rich display)
df_updated
Out[371]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 37.7 40.2 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 439.415 37.7 40.2 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 439.415 37.7 40.2 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 439.415 37.7 40.2 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 439.415 37.7 40.2 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 22.9 37.1 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 370.946 22.9 37.1 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 370.946 22.9 37.1 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 370.946 22.9 37.1 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 370.946 22.9 37.1 76.05 17.850 43.0 2.036403

2076 rows × 9 columns

In [372]:
# Convert the time series into a supervised-learning table: a Random Forest is a
# non-sequential model, so temporal context is encoded explicitly as lagged
# copies of the target (previous day / week / month mortality rate). Lags are
# computed per country with groupby().shift() so a lag never crosses a country
# boundary, and this tabular form lets the forest rank predictors of mortality.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [373]:
# Rows inside a country's first day/week/month have no lagged observation;
# treat "no prior observation" as zero mortality for all three lag columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [374]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies, so the prediction target leaks into the components later used as
# model inputs — this likely inflates the downstream R^2; confirm and consider
# fitting PCA on the predictor columns only.
# NOTE(review): PCA here runs on unscaled data, so high-variance features
# (e.g. cardiovasc_death_rate) dominate the components — consider standardizing first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[374]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [375]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project the same columns onto the fitted PCA basis and keep the first six
# (highest-variance) components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [376]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — labelling PC1..PC6 with original feature names is misleading and
# makes the later "feature importances" read as per-feature scores; verify intent.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# .values keeps the positional row order, aligning locations with components.
principal_df['location'] = df_updated['location'].values
In [377]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never used below — X is built
# from principal_df and only 'Mortality Rate' is read from this frame afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [378]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the six principal components (labelled with original feature names);
# y: the raw mortality rate. Alignment between the two frames relies on both
# preserving the original row order, which .values does here.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on a time series lets the model train
# on rows from the "future" of test rows — a time-based split would be more
# defensible for this data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [379]:
# Fit scaling on the training set
# Mean/std are learned from the training split only, so no information from
# the test split leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[379]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [380]:
# Apply scaling on the training set
# (uses the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [381]:
# Apply scaling on the test set
# (same train-fitted parameters — never refit on the test split)
X_test_scaled = scaler.transform(X_test)
In [382]:
# Instantiate the RandomForestRegressor Model
# (n_estimators here is a placeholder; GridSearchCV below searches over it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3*3*3*3 = 81 candidate combinations; with cv=10 below that is 810 model fits.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [383]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81 x 10 fits across all cores; results are
# unchanged because each fit is still seeded via the estimator's random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9966830298813072
In [384]:
# fit random forest model with best hyperparameters from above.
# Unpacking best_params_ keeps this cell in sync with the grid automatically
# instead of copying each key by hand (grid_search.best_estimator_ is an
# already-refit equivalent; rebuilding keeps the explicit random_state visible).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [385]:
# Evaluate the Random Forest on the held-out test set (MSE, RMSE, R^2, MAE).
# scipy.stats.entropy(p, q) computes the KL divergence between two probability
# distributions; mortality rates are not a distribution and zeros in y_test
# drive it to inf (as seen in earlier runs), so it is not a valid regression
# metric — mean absolute error is reported instead.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
mae = np.mean(np.abs(y_test - y_pred))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("MAE:", mae)
MSE:  0.002114226351778038
R2 Score: 0.9987690796288725
RMSE: 0.045981
Entropy Value: 0.00023982543523329205
In [386]:
# Rank the six model inputs by impurity-based importance.
# NOTE(review): these are importances of principal components that were merely
# labelled with original feature names in the PCA dataframe above — they do not
# measure the original variables' importance directly.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[386]:
feature importance
0 cardiovasc_death_rate 0.735038
1 female_smokers 0.152939
5 median_age 0.086766
2 male_smokers 0.022275
3 life_expectancy 0.002292
4 aged_65_older 0.000690
In [387]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare last expression shows the loaded frame via rich display.
df_updated
Out[387]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [388]:
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for the Random Forest Model Analysis for the country health index.
# .copy() makes the two-country subset an explicit copy so the lagged-column
# assignments in the following cells do not raise pandas' SettingWithCopyWarning.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
df_updated = df_updated[feature_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [389]:
# Show the filtered two-country subset (bare last expression uses rich display)
df_updated
Out[389]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403

2076 rows × 9 columns

In [390]:
# Convert the time series into a supervised-learning table: a Random Forest is a
# non-sequential model, so temporal context is encoded explicitly as lagged
# copies of the target (previous day / week / month mortality rate). Lags are
# computed per country with groupby().shift() so a lag never crosses a country
# boundary, and this tabular form lets the forest rank predictors of mortality.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [391]:
# Rows inside a country's first day/week/month have no lagged observation;
# treat "no prior observation" as zero mortality for all three lag columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [392]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies, so the prediction target leaks into the components later used as
# model inputs — this likely inflates the downstream R^2; confirm and consider
# fitting PCA on the predictor columns only.
# NOTE(review): PCA here runs on unscaled data, so high-variance features
# (e.g. gdp_per_capita, population) dominate the components — consider standardizing first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[392]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [393]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project the same columns onto the fitted PCA basis and keep the first six
# (highest-variance) components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [394]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — labelling PC1..PC6 with original feature names is misleading and
# makes the later "feature importances" read as per-feature scores; verify intent.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# .values keeps the positional row order, aligning locations with components.
principal_df['location'] = df_updated['location'].values
In [395]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never used below — X is built
# from principal_df and only 'Mortality Rate' is read from this frame afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [396]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the six principal components (labelled with original feature names);
# y: the raw mortality rate. Alignment between the two frames relies on both
# preserving the original row order, which .values does here.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on a time series lets the model train
# on rows from the "future" of test rows — a time-based split would be more
# defensible for this data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [397]:
# Fit scaling on the training set
# Mean/std are learned from the training split only, so no information from
# the test split leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[397]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [398]:
# Apply scaling on the training set
# (uses the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [399]:
# Apply scaling on the test set
# (same train-fitted parameters — never refit on the test split)
X_test_scaled = scaler.transform(X_test)
In [400]:
# Instantiate the RandomForestRegressor Model
# (the n_estimators set here is only a placeholder — GridSearchCV below
# overrides every hyperparameter listed in param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [401]:
# perform grid search and 10-fold cross-validation (k = 10).
# Default scoring for a regressor is R^2. n_jobs=-1 runs the 81 x 10 fits in
# parallel; the selected model is unchanged because the estimator's
# random_state is fixed and KFold without shuffling is deterministic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9942151184753699
In [402]:
# Refit a random forest using the best hyperparameters found above.
# best_params_ holds exactly the four tuned keys, so splat it rather than
# copying each entry by hand — same configuration, no copy-paste risk.
# (Equivalently, grid_search.best_estimator_ is already refit on the full
# training set because GridSearchCV defaults to refit=True.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [403]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — not a standard
# regression metric, and ill-defined where the second argument has zeros.
# Interpret with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003107698404524249
R2 Score: 0.9981906718406797
RMSE: 0.055747
Entropy Value: 0.0004506715934095217
In [404]:
# Rank model inputs by impurity-based importance, highest first.
# NOTE(review): these "features" are principal components that were merely
# labelled with original column names, so importances describe the PCs, not
# the raw variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[404]:
feature importance
5 population 0.753233
1 human_development_index 0.216832
2 extreme_poverty 0.021982
3 gdp_per_capita 0.007136
4 population_density 0.000814
0 hospital_beds_per_thousand 0.000003
In [405]:
# Import the dataframe holding the first country of each country pairing
# (produced in a previous step).
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[405]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [406]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Keep the population-health predictors plus the target, restricted to the
# two countries being compared (column selection and row filter in one .loc).
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [407]:
# Quick visual check of the two-country slice.
df_updated
Out[407]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 20.1 25.0 81.32 19.062 44.5 0.536669

2091 rows × 9 columns

In [408]:
# Convert the time series into a supervised-learning table: a random forest
# has no notion of temporal order, so past mortality is added explicitly as
# lagged columns (previous day / week / month) via groupby().shift().
# Grouping by 'location' keeps each country's history separate, so one
# country's values never shift into another country's rows.
lag_periods = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
In [409]:
# The earliest day/week/month of each country has no lag history; treat those
# missing lag values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [410]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the fit
# includes 'Mortality Rate' and the lagged-mortality columns — target
# information leaks into the components. PCA is also fit on unscaled data,
# letting high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[410]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [411]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project onto all components, then keep only the first 6 (highest variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [412]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear mix of ALL input columns), not the original
# feature it is named after. The downstream "feature importances" inherit
# this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Positional alignment: assumes principal_components preserves df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [413]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused afterwards — X is built from
# principal_df and only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [414]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values   # first 6 principal components (labelled with original feature names)
y = df_updated['Mortality Rate'].values  # aligned with X by row position

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random row-wise split of a time series lets training see
# future days; combined with target-derived columns entering the PCA this
# likely inflates the test scores below. Consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [415]:
# Fit scaling on the training set
# Good practice: the scaler is fit on training data only and later applied to
# both splits, so test-set statistics never leak into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[415]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [416]:
# Apply scaling on the training set
# Uses the mean/std learned from the training split above.
X_train_scaled = scaler.transform(X_train)
In [417]:
# Apply scaling on the test set
# The same training-set statistics are reused; the test set is never fit.
X_test_scaled = scaler.transform(X_test)
In [418]:
# Instantiate the RandomForestRegressor Model
# (the n_estimators set here is only a placeholder — GridSearchCV below
# overrides every hyperparameter listed in param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [419]:
# perform grid search and 10-fold cross-validation (k = 10).
# Default scoring for a regressor is R^2. n_jobs=-1 runs the 81 x 10 fits in
# parallel; the selected model is unchanged because the estimator's
# random_state is fixed and KFold without shuffling is deterministic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974452198310478
In [420]:
# Refit a random forest using the best hyperparameters found above.
# best_params_ holds exactly the four tuned keys, so splat it rather than
# copying each entry by hand — same configuration, no copy-paste risk.
# (Equivalently, grid_search.best_estimator_ is already refit on the full
# training set because GridSearchCV defaults to refit=True.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [421]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — not a standard
# regression metric, and ill-defined where the second argument has zeros.
# Interpret with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005940423882190219
R2 Score: 0.9970855058116366
RMSE: 0.077074
Entropy Value: 0.0007644065172632581
In [422]:
# Rank model inputs by impurity-based importance, highest first.
# NOTE(review): these "features" are principal components that were merely
# labelled with original column names, so importances describe the PCs, not
# the raw variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[422]:
feature importance
1 female_smokers 0.641126
0 cardiovasc_death_rate 0.314675
2 male_smokers 0.022687
5 median_age 0.017803
3 life_expectancy 0.003272
4 aged_65_older 0.000437
In [423]:
# Import the dataframe holding the first country of each country pairing
# (produced in a previous step).
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[423]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [424]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Keep the country-health predictors plus the target, restricted to the two
# countries being compared (column selection and row filter in one .loc).
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [425]:
# Quick visual check of the two-country slice.
df_updated
Out[425]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2091 rows × 9 columns

In [426]:
# Convert the time series into a supervised-learning table: a random forest
# has no notion of temporal order, so past mortality is added explicitly as
# lagged columns (previous day / week / month) via groupby().shift().
# Grouping by 'location' keeps each country's history separate, so one
# country's values never shift into another country's rows.
lag_periods = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
In [427]:
# The earliest day/week/month of each country has no lag history; treat those
# missing lag values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [428]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the fit
# includes 'Mortality Rate' and the lagged-mortality columns — target
# information leaks into the components. PCA is also fit on unscaled data,
# letting high-variance columns (e.g. population) dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[428]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [429]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project onto all components, then keep only the first 6 (highest variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [430]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear mix of ALL input columns), not the original
# feature it is named after. The downstream "feature importances" inherit
# this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Positional alignment: assumes principal_components preserves df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [431]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused afterwards — X is built from
# principal_df and only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [432]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values   # first 6 principal components (labelled with original feature names)
y = df_updated['Mortality Rate'].values  # aligned with X by row position

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random row-wise split of a time series lets training see
# future days; combined with target-derived columns entering the PCA this
# likely inflates the test scores below. Consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [433]:
# Fit scaling on the training set
# Good practice: the scaler is fit on training data only and later applied to
# both splits, so test-set statistics never leak into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[433]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [434]:
# Apply scaling on the training set
# Uses the mean/std learned from the training split above.
X_train_scaled = scaler.transform(X_train)
In [435]:
# Apply scaling on the test set
# The same training-set statistics are reused; the test set is never fit.
X_test_scaled = scaler.transform(X_test)
In [436]:
# Instantiate the RandomForestRegressor Model
# (the n_estimators set here is only a placeholder — GridSearchCV below
# overrides every hyperparameter listed in param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [437]:
# perform grid search and 10-fold cross-validation (k = 10).
# Default scoring for a regressor is R^2. n_jobs=-1 runs the 81 x 10 fits in
# parallel; the selected model is unchanged because the estimator's
# random_state is fixed and KFold without shuffling is deterministic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9967388889181926
In [438]:
# Refit a random forest using the best hyperparameters found above.
# best_params_ holds exactly the four tuned keys, so splat it rather than
# copying each entry by hand — same configuration, no copy-paste risk.
# (Equivalently, grid_search.best_estimator_ is already refit on the full
# training set because GridSearchCV defaults to refit=True.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [439]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — not a standard
# regression metric, and ill-defined where the second argument has zeros.
# Interpret with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010420033179699867
R2 Score: 0.9948877173166316
RMSE: 0.102079
Entropy Value: 0.0011192503090849918
In [440]:
# Rank model inputs by impurity-based importance, highest first.
# NOTE(review): these "features" are principal components that were merely
# labelled with original column names, so importances describe the PCs, not
# the raw variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[440]:
feature importance
1 human_development_index 0.891315
5 population 0.082122
2 extreme_poverty 0.022988
3 gdp_per_capita 0.002865
4 population_density 0.000641
0 hospital_beds_per_thousand 0.000069
In [441]:
# Import the dataframe holding the first country of each country pairing
# (produced in a previous step).
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[441]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [442]:
country1 = 'Spain'
country2 = 'United States'

# Keep the population-health predictors plus the target, restricted to the
# two countries being compared (column selection and row filter in one .loc).
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [443]:
# Quick visual check of the two-country slice.
df_updated
Out[443]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
24074 Spain 2/1/2020 99.403 27.4 31.4 83.56 19.436 45.5 0.000000
24075 Spain 2/2/2020 99.403 27.4 31.4 83.56 19.436 45.5 0.000000
24076 Spain 2/3/2020 99.403 27.4 31.4 83.56 19.436 45.5 0.000000
24077 Spain 2/4/2020 99.403 27.4 31.4 83.56 19.436 45.5 0.000000
24078 Spain 2/5/2020 99.403 27.4 31.4 83.56 19.436 45.5 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 9 columns

In [444]:
# Convert the time series into a supervised-learning table: a random forest
# has no notion of temporal order, so past mortality is added explicitly as
# lagged columns (previous day / week / month) via groupby().shift().
# Grouping by 'location' keeps each country's history separate, so one
# country's values never shift into another country's rows.
lag_periods = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
In [445]:
# The earliest day/week/month of each country has no lag history; treat those
# missing lag values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [446]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the fit
# includes 'Mortality Rate' and the lagged-mortality columns — target
# information leaks into the components. PCA is also fit on unscaled data,
# letting high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[446]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [447]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project onto all components, then keep only the first 6 (highest variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [448]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear mix of ALL input columns), not the original
# feature it is named after. The downstream "feature importances" inherit
# this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Positional alignment: assumes principal_components preserves df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [449]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused afterwards — X is built from
# principal_df and only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [450]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values   # first 6 principal components (labelled with original feature names)
y = df_updated['Mortality Rate'].values  # aligned with X by row position

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random row-wise split of a time series lets training see
# future days; combined with target-derived columns entering the PCA this
# likely inflates the test scores below. Consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [451]:
# Fit scaling on the training set
# Good practice: the scaler is fit on training data only and later applied to
# both splits, so test-set statistics never leak into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[451]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [452]:
# Apply scaling on the training set
# Uses the mean/std learned from the training split above.
X_train_scaled = scaler.transform(X_train)
In [453]:
# Apply scaling on the test set
# The same training-set statistics are reused; the test set is never fit.
X_test_scaled = scaler.transform(X_test)
In [454]:
# Instantiate the RandomForestRegressor Model
# (the n_estimators set here is only a placeholder — GridSearchCV below
# overrides every hyperparameter listed in param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [455]:
# perform grid search and 10-fold cross-validation (k = 10).
# Default scoring for a regressor is R^2. n_jobs=-1 runs the 81 x 10 fits in
# parallel; the selected model is unchanged because the estimator's
# random_state is fixed and KFold without shuffling is deterministic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9979863431205297
In [456]:
# Refit a random forest using the best hyperparameters found above.
# best_params_ holds exactly the four tuned keys, so splat it rather than
# copying each entry by hand — same configuration, no copy-paste risk.
# (Equivalently, grid_search.best_estimator_ is already refit on the full
# training set because GridSearchCV defaults to refit=True.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [457]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — not a standard
# regression metric, and ill-defined where the second argument has zeros.
# Interpret with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.07209044944386439
R2 Score: 0.9878655952342997
RMSE: 0.268497
Entropy Value: 0.002601145529676064
In [458]:
# Rank model inputs by impurity-based importance, highest first.
# NOTE(review): these "features" are principal components that were merely
# labelled with original column names, so importances describe the PCs, not
# the raw variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[458]:
feature importance
1 female_smokers 0.915632
0 cardiovasc_death_rate 0.053216
2 male_smokers 0.026576
3 life_expectancy 0.002459
5 median_age 0.001546
4 aged_65_older 0.000572
In [459]:
# Import the dataframe holding the first country of each country pairing
# (produced in a previous step).
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[459]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [460]:
country1 = 'Spain'
country2 = 'United States'

# Restrict to the country pair and to the columns used by the country-health-index
# Random Forest analysis (row filter and column selection done in one .loc call).
cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
        'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
        'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
In [461]:
df_updated
Out[461]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
24074 Spain 2/1/2020 2.97 0.904 1.0 34272.360 93.105 47558632 0.000000
24075 Spain 2/2/2020 2.97 0.904 1.0 34272.360 93.105 47558632 0.000000
24076 Spain 2/3/2020 2.97 0.904 1.0 34272.360 93.105 47558632 0.000000
24077 Spain 2/4/2020 2.97 0.904 1.0 34272.360 93.105 47558632 0.000000
24078 Spain 2/5/2020 2.97 0.904 1.0 34272.360 93.105 47558632 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2136 rows × 9 columns

In [462]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# df_updated is a filtered slice of an earlier frame; take an explicit copy so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()

# Create lagged variables for the previous day mortality, previous week mortality, and
# previous month mortality rates; grouping by location keeps a shift from crossing
# the boundary between the two countries' series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [463]:
# The lagged mortality columns start with NaNs (no earlier observation exists);
# replace those leading NaNs with 0 in a single vectorised operation.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [464]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lagged
# columns, so the target leaks into the components — TODO restrict to predictors only.
# NOTE(review): PCA is fit on unscaled data and on the full dataset (before the
# train/test split), which also leaks test information into the components — verify intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[464]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [465]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project the same (unsplit) matrix used for fitting and keep the first 6 components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [466]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input
# variables), not the original variables — labelling them with the original feature
# names makes the later feature-importance table misleading; consider PC1..PC6 names.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [467]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df, y from 'Mortality Rate') — verify this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [468]:
# Feature matrix from the PCA components (see mislabelling note above) and target vector
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series places adjacent days in train
# and test, inflating scores — consider TimeSeriesSplit or a date-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [469]:
# Fit scaling on the training set only (correct: test statistics must not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[469]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [470]:
# Apply scaling on the training set using the statistics fitted above
X_train_scaled = scaler.transform(X_train)
In [471]:
# Apply scaling on the test set with the training-set statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [472]:
# Base RandomForestRegressor; the hyperparameters below are tuned by grid search,
# so the n_estimators given here only serves as the estimator's default.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space (3*3*3*3 = 81 candidates)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [473]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81 candidate fits across all CPU cores; results unchanged.
# NOTE(review): plain k-fold CV on a time series shares neighbouring days between
# folds — consider TimeSeriesSplit as the cv argument.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9974172303447238
In [474]:
# Refit a random forest with the best hyperparameters found by the grid search.
# **best_params_ passes the tuned values directly instead of copying them key by key,
# which avoids silently dropping a parameter if the grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
In [475]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arrays and computes the KL
# divergence between them — treating regression targets/predictions as probability
# distributions is not a standard error metric; verify this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.069094695635595
R2 Score: 0.9883698463461787
RMSE: 0.262859
Entropy Value: 0.002484362767590676
In [476]:
# Rank the model inputs by their importance in the fitted Random Forest.
# NOTE: these are PCA components labelled with original feature names upstream,
# so read the ranking with that caveat in mind.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[476]:
feature importance
1 human_development_index 0.954547
2 extreme_poverty 0.026867
5 population 0.012769
3 gdp_per_capita 0.003674
0 hospital_beds_per_thousand 0.001522
4 population_density 0.000621
In [46]:
# Country Pair by Pair Analysis relative to median age
In [47]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative path
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[47]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [48]:
# One filtered frame per country; blank lines group the 13 median-age pairs
# (first country of each pair, then second).
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]
df_Italy = df.loc[df["location"] == "Italy"]

df_Portugal = df.loc[df["location"] == "Portugal"]
df_Slovenia = df.loc[df["location"] == "Slovenia"]

df_Spain = df.loc[df["location"] == "Spain"]
df_Austria = df.loc[df["location"] == "Austria"]

df_Belgium = df.loc[df["location"] == "Belgium"]
df_Canada = df.loc[df["location"] == "Canada"]

df_Czechia = df.loc[df["location"] == "Czechia"]
df_Denmark = df.loc[df["location"] == "Denmark"]

df_Estonia = df.loc[df["location"] == "Estonia"]
df_Finland = df.loc[df["location"] == "Finland"]

df_France = df.loc[df["location"] == "France"]
df_Latvia = df.loc[df["location"] == "Latvia"]

df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Romania = df.loc[df["location"] == "Romania"]

df_Serbia = df.loc[df["location"] == "Serbia"]
df_Slovakia = df.loc[df["location"] == "Slovakia"]

df_Sweden = df.loc[df["location"] == "Sweden"]
df_Switzerland = df.loc[df["location"] == "Switzerland"]

df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Iceland = df.loc[df["location"] == "Iceland"]

df_Ireland = df.loc[df["location"] == "Ireland"]
df_Luxembourg = df.loc[df["location"] == "Luxembourg"]

df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_UnitedStates = df.loc[df["location"] == "United States"]
In [49]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [50]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file; index=False keeps the pandas row index out of
# the file, so re-importing it does not create a spurious 'Unnamed: 0' column.
dataframe_one.to_csv("dataframe-one.csv", index=False)
In [51]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative path
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[51]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [10]:
country1 = 'Bulgaria'
country2 = 'Italy'

# Restrict to the country pair and to the columns used by the population-health-index
# Random Forest analysis (row filter and column selection done in one .loc call).
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
        'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
        'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
In [11]:
df_updated
Out[11]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 14.285714
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 0.735109

2091 rows × 9 columns

In [12]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# df_updated is a filtered slice of an earlier frame; take an explicit copy so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()

# Create lagged variables for the previous day mortality, previous week mortality, and
# previous month mortality rates; grouping by location keeps a shift from crossing
# the boundary between the two countries' series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [13]:
# The lagged mortality columns start with NaNs (no earlier observation exists);
# replace those leading NaNs with 0 in a single vectorised operation.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [14]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lagged
# columns, so the target leaks into the components — TODO restrict to predictors only.
# NOTE(review): PCA is fit on unscaled data and on the full dataset (before the
# train/test split), which also leaks test information into the components — verify intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[14]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [15]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# Project the same (unsplit) matrix used for fitting and keep the first 6 components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [16]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input
# variables), not the original variables — labelling them with the original feature
# names makes the later feature-importance table misleading; consider PC1..PC6 names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [17]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df, y from 'Mortality Rate') — verify this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [18]:
# Feature matrix from the PCA components (see mislabelling note above) and target vector
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series places adjacent days in train
# and test, inflating scores — consider TimeSeriesSplit or a date-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [19]:
# Fit scaling on the training set only (correct: test statistics must not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[19]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [20]:
# Apply scaling on the training set using the statistics fitted above
X_train_scaled = scaler.transform(X_train)
In [21]:
# Apply scaling on the test set with the training-set statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [22]:
# Base RandomForestRegressor; the hyperparameters below are tuned by grid search,
# so the n_estimators given here only serves as the estimator's default.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space (3*3*3*3 = 81 candidates)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [23]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81 candidate fits across all CPU cores; results unchanged.
# NOTE(review): plain k-fold CV on a time series shares neighbouring days between
# folds — consider TimeSeriesSplit as the cv argument.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9897461070578887
In [24]:
# Refit a random forest with the best hyperparameters found by the grid search.
# **best_params_ passes the tuned values directly instead of copying them key by key,
# which avoids silently dropping a parameter if the grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
In [25]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arrays and computes the KL
# divergence between them — treating regression targets/predictions as probability
# distributions is not a standard error metric; verify this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.014757882118008693
R2 Score: 0.9986128106037364
RMSE: 0.121482
Entropy Value: 0.000395927702763076
In [26]:
# Rank the model inputs by their importance in the fitted Random Forest.
# NOTE: these are PCA components labelled with original feature names upstream,
# so read the ranking with that caveat in mind.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[26]:
feature importance
1 diabetes_prevalence 0.529929
5 aged_65_older 0.436215
2 female_smokers 0.025303
0 cardiovasc_death_rate 0.004177
3 male_smokers 0.002378
4 life_expectancy 0.001999
In [27]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative path
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[27]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [28]:
country1 = 'Bulgaria'
country2 = 'Italy'

# Restrict to the country pair and to the columns used by the country-health-index
# Random Forest analysis (row filter and column selection done in one .loc call).
cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
        'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
        'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
In [29]:
df_updated
Out[29]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.180 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.180 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.180 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.180 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.180 0.892 2.0 35220.084 205.859 59037472 0.735109

2091 rows × 9 columns

In [30]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# df_updated is a filtered slice of an earlier frame; take an explicit copy so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()

# Create lagged variables for the previous day mortality, previous week mortality, and
# previous month mortality rates; grouping by location keeps a shift from crossing
# the boundary between the two countries' series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [31]:
# The lagged mortality columns start with NaNs (no earlier observation exists);
# replace those leading NaNs with 0 in a single vectorised operation.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [32]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lagged
# columns, so the target leaks into the components — TODO restrict to predictors only.
# NOTE(review): PCA is fit on unscaled data and on the full dataset (before the
# train/test split), which also leaks test information into the components — verify intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[32]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [33]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# Project the same (unsplit) matrix used for fitting and keep the first 6 components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [34]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input
# variables), not the original variables — labelling them with the original feature
# names makes the later feature-importance table misleading; consider PC1..PC6 names.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [35]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df, y from 'Mortality Rate') — verify this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [36]:
# Feature matrix from the PCA components (see mislabelling note above) and target vector
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series places adjacent days in train
# and test, inflating scores — consider TimeSeriesSplit or a date-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [37]:
# Fit scaling on the training set only (correct: test statistics must not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[37]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [38]:
# Apply scaling on the training set using the statistics fitted above
X_train_scaled = scaler.transform(X_train)
In [39]:
# Apply scaling on the test set with the training-set statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [40]:
# Base RandomForestRegressor; the hyperparameters below are tuned by grid search,
# so the n_estimators given here only serves as the estimator's default.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space (3*3*3*3 = 81 candidates)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [41]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81 candidate fits across all CPU cores; results unchanged.
# NOTE(review): plain k-fold CV on a time series shares neighbouring days between
# folds — consider TimeSeriesSplit as the cv argument.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9908303413224842
In [42]:
# Refit a random forest with the best hyperparameters found by the grid search.
# **best_params_ passes the tuned values directly instead of copying them key by key,
# which avoids silently dropping a parameter if the grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
In [43]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arrays and computes the KL
# divergence between them — treating regression targets/predictions as probability
# distributions is not a standard error metric; verify this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.02496651758267187
R2 Score: 0.9976532345105231
RMSE: 0.158008
Entropy Value: 0.0005803326097664843
In [44]:
# Rank the model inputs by their importance in the fitted Random Forest.
# NOTE: these are PCA components labelled with original feature names upstream,
# so read the ranking with that caveat in mind.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[44]:
feature importance
1 human_development_index 0.873019
5 population 0.088615
2 extreme_poverty 0.032714
4 population_density 0.003001
3 gdp_per_capita 0.002228
0 hospital_beds_per_thousand 0.000424
In [45]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative path
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[45]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [46]:
country1 = 'Portugal'
country2 = 'Slovenia'

# Extracting important features for Random Forest Model Analysis for the population health index
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [47]:
df_updated  # preview the filtered two-country frame
Out[47]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 21.502 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 21.502 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 21.502 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 21.502 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 21.502 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 0.536669

2096 rows × 9 columns

In [48]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate within each country by one day, one week, and one month.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [49]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [50]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged copies,
# so the prediction target leaks into the components later used as model inputs —
# a likely contributor to the near-perfect R^2 downstream. Fit PCA on predictor
# columns only, and ideally on the training split only.
# NOTE(review): PCA is fit on unscaled columns, so the highest-variance columns
# dominate the leading components; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[50]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [51]:
# Keep only the first six principal components (one per downstream model input).
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
In [52]:
# Collect the leading components in a DataFrame for downstream modeling.
# NOTE(review): these columns are principal components (linear combinations of all
# inputs), not the raw features whose names they carry — the labels are misleading
# when the forest's feature importances are interpreted later.
pc_labels = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older']
principal_df = pd.DataFrame(principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
In [53]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below —
# X is built from principal_df — so this step does not affect the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [54]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split for the Random Forest model.
# NOTE(review): rows are time-ordered and the components encode current/lagged
# mortality, so a shuffled random split leaks information across time; a
# chronological split (shuffle=False) would be more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [55]:
# Fit scaling on the training set
# (mean/std are learned from the training split only and reused on the test split)
scaler = StandardScaler()
scaler.fit(X_train)
Out[55]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [56]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # standardize with train-set mean/std
In [57]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuse train-set mean/std (no refit)
In [58]:
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for GridSearchCV
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [60]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81-candidate x 10-fold fits in parallel on all cores
# without changing the selected model or its scores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983721604192913
In [61]:
# Refit a Random Forest on the full training set with the best hyperparameters.
# **best_params_ expands to exactly the four tuned keys, so this is equivalent to
# (and less error-prone than) copying each parameter by hand; note that
# grid_search.best_estimator_ is already refit and could be used directly.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions for evaluation below
y_pred = best_rf_model.predict(X_test_scaled)
In [62]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to probability
# distributions and returns their KL divergence; applied to raw mortality rates it
# is not a meaningful regression error metric (zeros in y_pred can yield inf).
# Consider mean_absolute_error or dropping this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0030370867428418185
R2 Score: 0.998479699547482
RMSE: 0.055110
Entropy Value: 0.0004092010063048637
In [63]:
# Rank the six inputs by the fitted forest's impurity-based importance.
# NOTE(review): the forest was trained on principal components that merely carry
# the raw feature names (see the PCA step above), so these importances describe
# components, not the original features.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)
feature_importances
Out[63]:
feature importance
1 diabetes_prevalence 0.936569
0 cardiovasc_death_rate 0.030037
2 female_smokers 0.027445
5 aged_65_older 0.004581
3 male_smokers 0.001084
4 life_expectancy 0.000284
In [64]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a path relative to a
# configurable data directory so the notebook runs on other machines.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[64]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [65]:
country1 = 'Portugal'
country2 = 'Slovenia'

# Extracting important features for the Random Forest Model Analysis for the country health index
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [66]:
df_updated  # preview the filtered two-country frame
Out[66]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2096 rows × 9 columns

In [67]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate within each country by one day, one week, and one month.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [68]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [69]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged copies,
# so the prediction target leaks into the components later used as model inputs —
# a likely contributor to the near-perfect R^2 downstream. Fit PCA on predictor
# columns only, and ideally on the training split only.
# NOTE(review): PCA is fit on unscaled columns (raw 'population' counts dwarf the
# rest), so the highest-variance columns dominate; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[69]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [70]:
# Keep only the first six principal components (one per downstream model input).
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
In [71]:
# Collect the leading components in a DataFrame for downstream modeling.
# NOTE(review): these columns are principal components (linear combinations of all
# inputs), not the raw features whose names they carry — the labels are misleading
# when the forest's feature importances are interpreted later.
pc_labels = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population']
principal_df = pd.DataFrame(principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
In [72]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below —
# X is built from principal_df — so this step does not affect the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [73]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split for the Random Forest model.
# NOTE(review): rows are time-ordered and the components encode current/lagged
# mortality, so a shuffled random split leaks information across time; a
# chronological split (shuffle=False) would be more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [74]:
# Fit scaling on the training set
# (mean/std are learned from the training split only and reused on the test split)
scaler = StandardScaler()
scaler.fit(X_train)
Out[74]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [75]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # standardize with train-set mean/std
In [76]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuse train-set mean/std (no refit)
In [77]:
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for GridSearchCV
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [78]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81-candidate x 10-fold fits in parallel on all cores
# without changing the selected model or its scores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.998167144839121
In [79]:
# Refit a Random Forest on the full training set with the best hyperparameters.
# **best_params_ expands to exactly the four tuned keys, so this is equivalent to
# (and less error-prone than) copying each parameter by hand; note that
# grid_search.best_estimator_ is already refit and could be used directly.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions for evaluation below
y_pred = best_rf_model.predict(X_test_scaled)
In [80]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to probability
# distributions and returns their KL divergence; applied to raw mortality rates it
# is not a meaningful regression error metric (zeros in y_pred can yield inf).
# Consider mean_absolute_error or dropping this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0038490153568325174
R2 Score: 0.9980732655059875
RMSE: 0.062040
Entropy Value: 0.000510878761187502
In [81]:
# Rank the six inputs by the fitted forest's impurity-based importance.
# NOTE(review): the forest was trained on principal components that merely carry
# the raw feature names (see the PCA step above), so these importances describe
# components, not the original features.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)
feature_importances
Out[81]:
feature importance
1 human_development_index 0.964288
2 extreme_poverty 0.029185
5 population 0.004776
3 gdp_per_capita 0.001256
4 population_density 0.000316
0 hospital_beds_per_thousand 0.000179
In [82]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a path relative to a
# configurable data directory so the notebook runs on other machines.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[82]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [83]:
country1 = 'Spain'
country2 = 'Austria'

# Extracting important features for Random Forest Model Analysis for the population health index
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [84]:
df_updated  # preview the filtered two-country frame
Out[84]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 83.56 19.436 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 83.56 19.436 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 83.56 19.436 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 83.56 19.436 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 83.56 19.436 0.855148

2102 rows × 9 columns

In [85]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate within each country by one day, one week, and one month.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [86]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [87]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged copies,
# so the prediction target leaks into the components later used as model inputs —
# a likely contributor to the near-perfect R^2 downstream. Fit PCA on predictor
# columns only, and ideally on the training split only.
# NOTE(review): PCA is fit on unscaled columns, so the highest-variance columns
# dominate the leading components; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[87]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [88]:
# Keep only the first six principal components (one per downstream model input).
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
In [89]:
# Collect the leading components in a DataFrame for downstream modeling.
# NOTE(review): these columns are principal components (linear combinations of all
# inputs), not the raw features whose names they carry — the labels are misleading
# when the forest's feature importances are interpreted later.
pc_labels = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older']
principal_df = pd.DataFrame(principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
In [90]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below —
# X is built from principal_df — so this step does not affect the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [91]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split for the Random Forest model.
# NOTE(review): rows are time-ordered and the components encode current/lagged
# mortality, so a shuffled random split leaks information across time; a
# chronological split (shuffle=False) would be more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [92]:
# Fit scaling on the training set
# (mean/std are learned from the training split only and reused on the test split)
scaler = StandardScaler()
scaler.fit(X_train)
Out[92]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [93]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # standardize with train-set mean/std
In [94]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuse train-set mean/std (no refit)
In [95]:
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for GridSearchCV
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [96]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81-candidate x 10-fold fits in parallel on all cores
# without changing the selected model or its scores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9985662954702006
In [97]:
# Refit a Random Forest on the full training set with the best hyperparameters.
# **best_params_ expands to exactly the four tuned keys, so this is equivalent to
# (and less error-prone than) copying each parameter by hand; note that
# grid_search.best_estimator_ is already refit and could be used directly.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions for evaluation below
y_pred = best_rf_model.predict(X_test_scaled)
In [98]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to probability
# distributions and returns their KL divergence; applied to raw mortality rates it
# is not a meaningful regression error metric (zeros in y_pred can yield inf).
# Consider mean_absolute_error or dropping this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004783382509163117
R2 Score: 0.9991495580176508
RMSE: 0.069162
Entropy Value: 0.00032436366709763954
In [99]:
# Rank the six inputs by the fitted forest's impurity-based importance.
# NOTE(review): the forest was trained on principal components that merely carry
# the raw feature names (see the PCA step above), so these importances describe
# components, not the original features.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)
feature_importances
Out[99]:
feature importance
0 cardiovasc_death_rate 0.505889
1 diabetes_prevalence 0.432103
2 female_smokers 0.042053
5 aged_65_older 0.016378
3 male_smokers 0.003240
4 life_expectancy 0.000337
In [100]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a path relative to a
# configurable data directory so the notebook runs on other machines.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
Out[100]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [101]:
country1 = 'Spain'
country2 = 'Austria'

# Extracting important features for the Random Forest Model Analysis for the country health index
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [102]:
df_updated  # preview the filtered two-country frame
Out[102]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148

2102 rows × 9 columns

In [103]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate within each country by one day, one week, and one month.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [104]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [105]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged copies,
# so the prediction target leaks into the components later used as model inputs —
# a likely contributor to the near-perfect R^2 downstream. Fit PCA on predictor
# columns only, and ideally on the training split only.
# NOTE(review): PCA is fit on unscaled columns (raw 'population' counts dwarf the
# rest), so the highest-variance columns dominate; standardize before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[105]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [106]:
# Keep only the first six principal components (one per downstream model input).
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
In [107]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [108]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [109]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [110]:
# Fit scaling on the training set
# NOTE(review): scaling is applied AFTER PCA here; the usual pipeline is
# StandardScaler -> PCA, otherwise large-scale columns dominate the components.
scaler = StandardScaler()
scaler.fit(X_train)
Out[110]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [111]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [112]:
# Apply scaling on the test set
# (uses the scaler fitted on the training set only — no scaling leakage here)
X_test_scaled = scaler.transform(X_test)
In [113]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): n_estimators=100 is a placeholder — every value in param_grid
# below overrides it during the grid search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [114]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scoring is R^2. The near-perfect CV score below is consistent
# with target leakage: the PCA inputs (df_updated.iloc[:,2:]) included 'Mortality Rate'
# itself and its lag columns.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975828020530777
In [115]:
# Refit a RandomForestRegressor on the full training set using the hyperparameters
# chosen by the grid search above, then predict on the held-out test set.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [116]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy with two arguments computes the KL divergence
# D(y_test || y_pred) after normalizing both vectors to sum to 1 — it is NOT Shannon
# entropy, and it returns inf if any y_pred element is 0 where the matching y_test > 0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005858624459701179
R2 Score: 0.9989583897608432
RMSE: 0.076542
Entropy Value: 0.00046284689785761193
In [117]:
# Rank the model inputs by impurity-based importance, highest first.
# (The labels come from selected_cols, i.e. the renamed PCA component slots.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[117]:
feature importance
1 human_development_index 0.926755
2 extreme_poverty 0.038959
5 population 0.029970
3 gdp_per_capita 0.003855
4 population_density 0.000348
0 hospital_beds_per_thousand 0.000113
In [118]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable Path
# (e.g. DATA_DIR / "dataframe-one.csv") so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[118]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [119]:
country1 = 'Belgium'
country2 = 'Canada'

# Extracting important features for Random Forest Model Analysis for the population health index
# (keeps the population-health columns plus the target, restricted to the two countries)
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [120]:
df_updated
Out[120]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 31.4 81.63 18.571 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 31.4 81.63 18.571 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 31.4 81.63 18.571 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 31.4 81.63 18.571 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 31.4 81.63 18.571 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 1.093162

2132 rows × 9 columns

In [121]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (groupby('location') keeps the two countries' series separate, so lags never cross countries)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [122]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): 0 is a plausible real mortality value, so these imputed lags are
# indistinguishable from genuine zero-mortality days at the start of each series.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [123]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] still includes 'Mortality Rate' (the target) and its lag
# columns, so the components fed to the model encode the target — leakage that
# inflates the scores below. PCA is also fit on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[123]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [124]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [125]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are principal components, each a mix of ALL inputs — naming them
# after original features is misleading; the importance table below inherits the labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [126]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummies are never used afterwards; X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [127]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffle split on a time series — consider TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [128]:
# Fit scaling on the training set
# NOTE(review): scaling happens AFTER PCA here; conventional order is scale -> PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[128]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [129]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [130]:
# Apply scaling on the test set (scaler fitted on training data only)
X_test_scaled = scaler.transform(X_test)
In [131]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [132]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the near-perfect CV score is consistent with target leakage — the PCA
# inputs (iloc[:,2:]) contained 'Mortality Rate' and its lags.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.997355164620372
In [133]:
# Refit a RandomForestRegressor on the full training set using the hyperparameters
# chosen by the grid search above, then predict on the held-out test set.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [134]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(y_test, y_pred) is the KL divergence of the normalized vectors,
# not Shannon entropy; it is inf if y_pred has a zero where y_test > 0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.030061431635492165
R2 Score: 0.997726838354729
RMSE: 0.173382
Entropy Value: 0.0006262375647953363
In [135]:
# Rank the model inputs by impurity-based importance, highest first.
# (Labels are the selected_cols names assigned to the PCA component slots.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[135]:
feature importance
1 diabetes_prevalence 0.905479
0 cardiovasc_death_rate 0.059714
2 female_smokers 0.024466
5 aged_65_older 0.007487
3 male_smokers 0.002440
4 life_expectancy 0.000414
In [136]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable Path constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[136]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [137]:
country1 = 'Belgium'
country2 = 'Canada'

# Extracting important features for the Random Forest Model Analysis for the country health index
# (keeps the country-level socioeconomic/health-system columns plus the target)
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [138]:
df_updated
Out[138]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.093162

2132 rows × 9 columns

In [139]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (groupby('location') keeps the two countries' series separate, so lags never cross countries)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [140]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): 0 is a plausible real mortality value, so these imputed lags are
# indistinguishable from genuine zero-mortality days at the start of each series.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [141]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] still includes 'Mortality Rate' (the target) and its lag
# columns, so the components fed to the model encode the target — leakage that
# inflates the scores below. PCA is also fit on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[141]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [142]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [143]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are principal components, each a mix of ALL inputs — naming them
# after original features is misleading; the importance table below inherits the labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [144]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummies are never used afterwards; X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [145]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffle split on a time series — consider TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [146]:
# Fit scaling on the training set
# NOTE(review): scaling happens AFTER PCA here; conventional order is scale -> PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[146]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [147]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [148]:
# Apply scaling on the test set (scaler fitted on training data only)
X_test_scaled = scaler.transform(X_test)
In [149]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [150]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the near-perfect CV score is consistent with target leakage — the PCA
# inputs (iloc[:,2:]) contained 'Mortality Rate' and its lags.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975469171045417
In [151]:
# Refit a RandomForestRegressor on the full training set using the hyperparameters
# chosen by the grid search above, then predict on the held-out test set.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [152]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(y_test, y_pred) is the KL divergence of the normalized vectors,
# not Shannon entropy; it is inf if y_pred has a zero where y_test > 0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.03171734845754228
R2 Score: 0.9976016225415473
RMSE: 0.178094
Entropy Value: 0.0007572895754231443
In [153]:
# Rank the model inputs by impurity-based importance, highest first.
# (Labels are the selected_cols names assigned to the PCA component slots.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[153]:
feature importance
1 human_development_index 0.965414
2 extreme_poverty 0.021985
5 population 0.007566
0 hospital_beds_per_thousand 0.002465
3 gdp_per_capita 0.001925
4 population_density 0.000644
In [154]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable Path constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[154]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [155]:
country1 = 'Czechia'
country2 = 'Denmark'

# Extracting important features for Random Forest Model Analysis for the population health index
# (keeps the population-health columns plus the target, restricted to the two countries)
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [156]:
df_updated
Out[156]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 79.38 19.027 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 79.38 19.027 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 79.38 19.027 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 79.38 19.027 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 79.38 19.027 0.000000
... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 114.767 6.41 19.3 18.8 80.90 19.677 0.227772
6245 Denmark 12/26/2022 114.767 6.41 19.3 18.8 80.90 19.677 0.227772
6246 Denmark 12/27/2022 114.767 6.41 19.3 18.8 80.90 19.677 0.228905
6247 Denmark 12/28/2022 114.767 6.41 19.3 18.8 80.90 19.677 0.229131
6248 Denmark 12/29/2022 114.767 6.41 19.3 18.8 80.90 19.677 0.229131

2096 rows × 9 columns

In [157]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (groupby('location') keeps the two countries' series separate, so lags never cross countries)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [158]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): 0 is a plausible real mortality value, so these imputed lags are
# indistinguishable from genuine zero-mortality days at the start of each series.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [159]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] still includes 'Mortality Rate' (the target) and its lag
# columns, so the components fed to the model encode the target — leakage that
# inflates the scores below. PCA is also fit on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[159]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [160]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [161]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are principal components, each a mix of ALL inputs — naming them
# after original features is misleading; the importance table below inherits the labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [162]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummies are never used afterwards; X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [163]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffle split on a time series — consider TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [164]:
# Fit scaling on the training set
# NOTE(review): scaling happens AFTER PCA here; conventional order is scale -> PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[164]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [165]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [166]:
# Apply scaling on the test set (scaler fitted on training data only)
X_test_scaled = scaler.transform(X_test)
In [167]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [168]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the near-perfect CV score is consistent with target leakage — the PCA
# inputs (iloc[:,2:]) contained 'Mortality Rate' and its lags.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9987729138287603
In [169]:
# Refit a RandomForestRegressor on the full training set using the hyperparameters
# chosen by the grid search above, then predict on the held-out test set.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [170]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(y_test, y_pred) is the KL divergence of the normalized vectors,
# not Shannon entropy; it is inf if y_pred has a zero where y_test > 0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003088227358667578
R2 Score: 0.9974360743157676
RMSE: 0.055572
Entropy Value: 0.0007456443001105355
In [171]:
# Rank the model inputs by impurity-based importance, highest first.
# (Labels are the selected_cols names assigned to the PCA component slots.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[171]:
feature importance
1 diabetes_prevalence 0.959778
2 female_smokers 0.023462
0 cardiovasc_death_rate 0.012141
5 aged_65_older 0.002635
3 male_smokers 0.001706
4 life_expectancy 0.000277
In [172]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# pathlib.Path DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[172]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [173]:
# Restrict the frame to the two countries under comparison and to the column
# set used by the country-health-index Random Forest analysis.
country1 = 'Czechia'
country2 = 'Denmark'

cols_of_interest = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
In [174]:
df_updated  # inspect the filtered two-country frame
Out[174]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
4153 Czechia 3/1/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4154 Czechia 3/2/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4155 Czechia 3/3/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4156 Czechia 3/4/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4157 Czechia 3/5/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.227772
6245 Denmark 12/26/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.227772
6246 Denmark 12/27/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.228905
6247 Denmark 12/28/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.229131
6248 Denmark 12/29/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.229131

2096 rows × 9 columns

In [175]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create per-country lagged mortality (t-1 day, t-7 days, t-30 days).
# NOTE(review): shift() is purely positional — this assumes each country's
# rows are already sorted by date and daily-contiguous; any gaps in the date
# column make "prev_day" actually "previous observation". Confirm the sort
# and contiguity upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [176]:
# The lag columns start with NaNs (no history yet at the head of each
# country's series); treat missing history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [177]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the prediction target leaks into the PCA inputs —
# downstream the model partly predicts mortality from mortality, which likely
# explains the ~0.998 R^2. PCA is also scale-sensitive: unscaled columns such
# as population dominate the components. Consider standardizing first and
# excluding the target/lag columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[177]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [178]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): equivalent to constructing PCA(n_components=6) up front;
# slicing the full transform computes and discards the extra components.
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [179]:
# Wrap the first six principal components in a DataFrame.
# NOTE(review): these columns are PCA components, not the original variables;
# reusing the raw feature names makes the later feature-importance table look
# like it ranks real features when it actually ranks components. Prefer
# neutral names such as 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [180]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream — X is built from
# principal_df and only 'Mortality Rate' is read from df_updated — so this
# step looks like dead code; verify before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [181]:
# NOTE(review): despite the names, these columns hold principal components
# (see the principal_df cell above), not the raw features.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 shuffle of daily time-series rows places
# near-duplicate neighboring days in both splits, inflating test scores;
# consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [182]:
# Learn standardization statistics from the training split only, so the test
# set cannot leak information through the scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[182]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [183]:
# Apply the train-fitted standardization to the training features.
X_train_scaled = scaler.transform(X_train)
In [184]:
# Apply the same train-fitted standardization to the test features.
X_test_scaled = scaler.transform(X_test)
In [185]:
# Hyperparameter search space for the random forest (3*3*3*3 = 81 candidates).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Base estimator; random_state fixed for reproducibility across re-runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
In [186]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 candidate fits across all CPU cores; the
# selected hyperparameters and scores are unchanged, only wall-clock improves.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the corresponding mean CV score
# (RandomForestRegressor's default scorer is R^2).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984672041343956
In [187]:
# Reuse the refit best model from the grid search instead of rebuilding it
# parameter-by-parameter: GridSearchCV (refit=True by default) has already
# retrained the winning configuration on the full training set, and rf's
# random_state=42 is preserved by cloning, so predictions are identical.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [188]:
# Evaluate the Random Forest model on the held-out test set: Mean Squared
# Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and "Entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two inputs after normalizing each to a probability distribution; it is
# not a standard regression metric, and it is infinite wherever y_pred is 0
# while y_test is not. Confirm this is the intended notion of "entropy".
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0024692245202902934
R2 Score: 0.9979499863732701
RMSE: 0.049691
Entropy Value: 0.0005994641511856433
In [189]:
# Rank inputs by the fitted forest's impurity-based feature importances.
# NOTE(review): the model was trained on PCA components (X is built from
# principal_df), so each importance belongs to a principal component; the
# raw-feature labels in selected_cols are misleading here.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[189]:
feature importance
1 human_development_index 0.967631
2 extreme_poverty 0.025845
5 population 0.003256
3 gdp_per_capita 0.001799
0 hospital_beds_per_thousand 0.001150
4 population_density 0.000319
In [190]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# pathlib.Path DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[190]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [191]:
# Restrict the frame to the two countries under comparison and to the column
# set used by the population-health-index Random Forest analysis.
country1 = 'Estonia'
country2 = 'Finland'

cols_of_interest = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
In [192]:
df_updated  # inspect the filtered two-country frame
Out[192]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 0.00000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 0.00000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 0.00000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 0.00000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 5.76 18.3 22.6 81.91 21.228 0.55159
8372 Finland 12/26/2022 153.507 5.76 18.3 22.6 81.91 21.228 0.55159
8373 Finland 12/27/2022 153.507 5.76 18.3 22.6 81.91 21.228 0.55159
8374 Finland 12/28/2022 153.507 5.76 18.3 22.6 81.91 21.228 0.55159
8375 Finland 12/29/2022 153.507 5.76 18.3 22.6 81.91 21.228 0.55159

2127 rows × 9 columns

In [193]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create per-country lagged mortality (t-1 day, t-7 days, t-30 days).
# NOTE(review): shift() is purely positional — this assumes each country's
# rows are sorted by date and daily-contiguous; the Estonia rows shown above
# have date gaps (1/6, 1/18, 2/5, ...), so "prev_day" is really "previous
# observation" there. Confirm the sort and contiguity upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [194]:
# The lag columns start with NaNs (no history yet at the head of each
# country's series); treat missing history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [195]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the prediction target leaks into the PCA inputs —
# downstream the model partly predicts mortality from mortality, which likely
# explains the ~0.996+ R^2. PCA is also scale-sensitive; standardize first
# and exclude the target/lag columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[195]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [196]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): equivalent to constructing PCA(n_components=6) up front;
# slicing the full transform computes and discards the extra components.
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [197]:
# Wrap the first six principal components in a DataFrame.
# NOTE(review): these columns are PCA components, not the original variables;
# reusing the raw feature names makes the later feature-importance table look
# like it ranks real features when it actually ranks components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [198]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream — X is built from
# principal_df and only 'Mortality Rate' is read from df_updated — so this
# step looks like dead code; verify before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [199]:
# NOTE(review): despite the names, these columns hold principal components
# (see the principal_df cell above), not the raw features.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 shuffle of daily time-series rows places
# near-duplicate neighboring days in both splits, inflating test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [200]:
# Learn standardization statistics from the training split only, so the test
# set cannot leak information through the scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[200]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [201]:
# Apply the train-fitted standardization to the training features.
X_train_scaled = scaler.transform(X_train)
In [202]:
# Apply the same train-fitted standardization to the test features.
X_test_scaled = scaler.transform(X_test)
In [203]:
# Hyperparameter search space for the random forest (3*3*3*3 = 81 candidates).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Base estimator; random_state fixed for reproducibility across re-runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
In [204]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 candidate fits across all CPU cores; the
# selected hyperparameters and scores are unchanged, only wall-clock improves.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the corresponding mean CV score
# (RandomForestRegressor's default scorer is R^2).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9960406584730894
In [205]:
# Reuse the refit best model from the grid search instead of rebuilding it
# parameter-by-parameter: GridSearchCV (refit=True by default) has already
# retrained the winning configuration on the full training set, and rf's
# random_state=42 is preserved by cloning, so predictions are identical.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [206]:
# Evaluate the Random Forest model on the held-out test set: Mean Squared
# Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and "Entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two inputs after normalizing each to a probability distribution; it is
# not a standard regression metric, and it is infinite wherever y_pred is 0
# while y_test is not. Confirm this is the intended notion of "entropy".
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003447956787565997
R2 Score: 0.9969249632053334
RMSE: 0.058719
Entropy Value: 0.0011003619011085734
In [207]:
# Rank inputs by the fitted forest's impurity-based feature importances.
# NOTE(review): the model was trained on PCA components (X is built from
# principal_df), so each importance belongs to a principal component; the
# raw-feature labels in selected_cols are misleading here.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[207]:
feature importance
1 diabetes_prevalence 0.916769
0 cardiovasc_death_rate 0.046472
2 female_smokers 0.023264
5 aged_65_older 0.010802
3 male_smokers 0.001888
4 life_expectancy 0.000803
In [208]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# pathlib.Path DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[208]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [209]:
# Restrict the frame to the two countries under comparison and to the column
# set used by the country-health-index Random Forest analysis.
country1 = 'Estonia'
country2 = 'Finland'

cols_of_interest = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
In [210]:
df_updated  # inspect the filtered two-country frame
Out[210]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.00000
6250 Estonia 1/18/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.00000
6251 Estonia 2/5/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.00000
6252 Estonia 2/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.00000
6253 Estonia 2/7/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159

2127 rows × 9 columns

In [211]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create per-country lagged mortality (t-1 day, t-7 days, t-30 days).
# NOTE(review): shift() is purely positional — this assumes each country's
# rows are sorted by date and daily-contiguous; the Estonia rows shown above
# have date gaps, so "prev_day" is really "previous observation" there.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [212]:
# The lag columns start with NaNs (no history yet at the head of each
# country's series); treat missing history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [213]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the prediction target leaks into the PCA inputs —
# downstream the model partly predicts mortality from mortality, which likely
# explains the ~0.996+ R^2. PCA is also scale-sensitive; standardize first
# and exclude the target/lag columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[213]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [214]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): equivalent to constructing PCA(n_components=6) up front;
# slicing the full transform computes and discards the extra components.
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [215]:
# Wrap the first six principal components in a DataFrame.
# NOTE(review): these columns are PCA components, not the original variables;
# reusing the raw feature names makes the later feature-importance table look
# like it ranks real features when it actually ranks components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [216]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream — X is built from
# principal_df and only 'Mortality Rate' is read from df_updated — so this
# step looks like dead code; verify before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [217]:
# NOTE(review): despite the names, these columns hold principal components
# (see the principal_df cell above), not the raw features.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 shuffle of daily time-series rows places
# near-duplicate neighboring days in both splits, inflating test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [218]:
# Learn standardization statistics from the training split only, so the test
# set cannot leak information through the scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[218]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [219]:
# Apply the train-fitted standardization to the training features.
X_train_scaled = scaler.transform(X_train)
In [220]:
# Apply the same train-fitted standardization to the test features.
X_test_scaled = scaler.transform(X_test)
In [221]:
# Hyperparameter search space for the random forest (3*3*3*3 = 81 candidates).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Base estimator; random_state fixed for reproducibility across re-runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
In [222]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 candidate fits across all CPU cores; the
# selected hyperparameters and scores are unchanged, only wall-clock improves.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the corresponding mean CV score
# (RandomForestRegressor's default scorer is R^2).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9961020915988987
In [223]:
# Reuse the refit best model from the grid search instead of rebuilding it
# parameter-by-parameter: GridSearchCV (refit=True by default) has already
# retrained the winning configuration on the full training set, and rf's
# random_state=42 is preserved by cloning, so predictions are identical.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [224]:
# Evaluate the Random Forest model on the held-out test set: Mean Squared
# Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and "Entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two inputs after normalizing each to a probability distribution; it is
# not a standard regression metric, and it is infinite wherever y_pred is 0
# while y_test is not. Confirm this is the intended notion of "entropy".
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002827297964652223
R2 Score: 0.9974784935524297
RMSE: 0.053172
Entropy Value: 0.0008687386647983583
In [225]:
# Rank inputs by the fitted forest's impurity-based feature importances.
# NOTE(review): the model was trained on PCA components (X is built from
# principal_df), so each importance belongs to a principal component; the
# raw-feature labels in selected_cols are misleading here.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[225]:
feature importance
1 human_development_index 0.955245
2 extreme_poverty 0.021036
5 population 0.017558
3 gdp_per_capita 0.003311
0 hospital_beds_per_thousand 0.002044
4 population_density 0.000806
In [226]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# pathlib.Path DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[226]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [227]:
# Restrict the frame to the two countries under comparison and to the column
# set used by the population-health-index Random Forest analysis.
country1 = 'France'
country2 = 'Latvia'

cols_of_interest = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
In [228]:
df_updated  # inspect the filtered two-country frame
Out[228]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
8376 France 1/24/2020 86.06 4.77 30.1 35.6 82.66 19.718 0.000000
8377 France 1/25/2020 86.06 4.77 30.1 35.6 82.66 19.718 0.000000
8378 France 1/26/2020 86.06 4.77 30.1 35.6 82.66 19.718 0.000000
8379 France 1/27/2020 86.06 4.77 30.1 35.6 82.66 19.718 0.000000
8380 France 1/28/2020 86.06 4.77 30.1 35.6 82.66 19.718 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.06 4.91 25.6 51.0 75.29 19.754 0.631631
20907 Latvia 12/26/2022 350.06 4.91 25.6 51.0 75.29 19.754 0.631631
20908 Latvia 12/27/2022 350.06 4.91 25.6 51.0 75.29 19.754 0.631485
20909 Latvia 12/28/2022 350.06 4.91 25.6 51.0 75.29 19.754 0.631485
20910 Latvia 12/29/2022 350.06 4.91 25.6 51.0 75.29 19.754 0.631969

2109 rows × 9 columns

In [229]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create per-country lagged mortality (t-1 day, t-7 days, t-30 days).
# NOTE(review): shift() is purely positional — this assumes each country's
# rows are sorted by date and daily-contiguous; any gaps make "prev_day"
# really "previous observation". Confirm the sort and contiguity upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [230]:
# The first day/week/month of each country's series has no lag history;
# treat those missing lag values as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [231]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the 'Mortality Rate' target and its lagged
# copies, so the target leaks into the components used as model inputs below —
# this likely inflates the reported R^2; confirm the intended column range.
# NOTE(review): PCA is fit on unscaled data here (StandardScaler is applied only
# after the split, downstream), so large-magnitude columns dominate the
# components; the conventional order is standardize first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[231]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [232]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of retained components (matches the count of model input variables)
# Project the data onto the leading components (columns ordered by explained variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [233]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixtures of
# ALL columns fed to the PCA), not the original features — reusing the original
# feature names here makes the later importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
# Carry the country label alongside the components (positional copy; assumes
# principal_components has one row per df_updated row — holds by construction)
principal_df['location'] = df_updated['location'].values
In [234]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused below (X is built from
# principal_df, y from 'Mortality Rate'), so this step may be dead code —
# verify before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [235]:
# Model inputs are the six retained principal components; the target is the
# unlagged mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; a fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [236]:
# Fit scaling on the training set only, so the test set is later transformed
# with the training statistics (no train/test contamination at this step)
scaler = StandardScaler()
scaler.fit(X_train)
Out[236]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [237]:
# Apply scaling on the training set (z-scores using the fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [238]:
# Apply scaling on the test set using the training-set statistics
X_test_scaled = scaler.transform(X_test)
In [239]:
# Base estimator for the hyperparameter search
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: forest size, tree depth, and split/leaf size constraints
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200],
}
In [240]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): this shuffles daily observations of a time series into random
# folds, so near-duplicate adjacent days land in both train and validation
# splits; TimeSeriesSplit would give a more honest score — confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9898387697173525
In [241]:
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids copying each hyperparameter by hand (which can
# silently drift if the grid gains a parameter); grid_search.best_estimator_
# would also work, since GridSearchCV refits on the full training set by default.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Test-set predictions for the evaluation cell below
y_pred = best_rf_model.predict(X_test_scaled)
In [242]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# of the two vectors after normalizing each to sum to 1 — it is not a standard
# regression metric, and its value here is hard to interpret; confirm it is
# intentional.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.12061782814064277
R2 Score: 0.9894304331066966
RMSE: 0.347301
Entropy Value: 0.002083591439709351
In [243]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[243]:
feature importance
0 cardiovasc_death_rate 0.576558
1 diabetes_prevalence 0.393405
2 female_smokers 0.019743
3 male_smokers 0.006207
5 aged_65_older 0.002619
4 life_expectancy 0.001468
In [244]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[244]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [245]:
country1 = 'France'
country2 = 'Latvia'

# Keep only the country-health-index features plus identifiers and the target,
# then restrict the rows to the two countries being compared.
country_cols = [
    'location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [246]:
df_updated
Out[246]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.000000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.000000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.000000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.000000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631969

2109 rows × 9 columns

In [247]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day, 7 days, 30 days back), computed per country
# so one country's history never bleeds into another country's lags.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [248]:
# The first day/week/month of each country's series has no lag history;
# treat those missing lag values as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [249]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the 'Mortality Rate' target and its lagged
# copies, so the target leaks into the components used as model inputs below —
# this likely inflates the reported R^2; confirm the intended column range.
# NOTE(review): PCA is fit on unscaled data here (StandardScaler is applied only
# after the split, downstream); the conventional order is standardize first,
# then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[249]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [250]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of retained components (matches the count of model input variables)
# Project the data onto the leading components (columns ordered by explained variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [251]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixtures of
# ALL columns fed to the PCA), not the original features — reusing the original
# feature names here makes the later importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the components (positional copy; assumes
# principal_components has one row per df_updated row — holds by construction)
principal_df['location'] = df_updated['location'].values
In [252]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused below (X is built from
# principal_df, y from 'Mortality Rate'), so this step may be dead code —
# verify before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [253]:
# Model inputs are the six retained principal components; the target is the
# unlagged mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; a fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [254]:
# Fit scaling on the training set only, so the test set is later transformed
# with the training statistics (no train/test contamination at this step)
scaler = StandardScaler()
scaler.fit(X_train)
Out[254]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [255]:
# Apply scaling on the training set (z-scores using the fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [256]:
# Apply scaling on the test set using the training-set statistics
X_test_scaled = scaler.transform(X_test)
In [257]:
# Base estimator for the hyperparameter search
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: forest size, tree depth, and split/leaf size constraints
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200],
}
In [258]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): this shuffles daily observations of a time series into random
# folds, so near-duplicate adjacent days land in both train and validation
# splits; TimeSeriesSplit would give a more honest score — confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9912037531723439
In [259]:
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids copying each hyperparameter by hand (which can
# silently drift if the grid gains a parameter); grid_search.best_estimator_
# would also work, since GridSearchCV refits on the full training set by default.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Test-set predictions for the evaluation cell below
y_pred = best_rf_model.predict(X_test_scaled)
In [260]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# of the two vectors after normalizing each to sum to 1 — it is not a standard
# regression metric, and its value here is hard to interpret; confirm it is
# intentional.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.13274985547442472
R2 Score: 0.9883673209910792
RMSE: 0.364349
Entropy Value: 0.0023789841094078866
In [261]:
# Rank the model inputs by impurity-based (Gini) importance.
# NOTE(review): these importances belong to the principal components, which
# were only *labelled* with the original feature names above — they do not
# measure the raw features directly.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[261]:
feature importance
1 human_development_index 0.949555
2 extreme_poverty 0.021708
5 population 0.010848
3 gdp_per_capita 0.009784
0 hospital_beds_per_thousand 0.006611
4 population_density 0.001495
In [262]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[262]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [263]:
country1 = 'Netherlands'
country2 = 'Romania'

# Keep only the population-health features plus identifiers and the target,
# then restrict the rows to the two countries being compared.
health_cols = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [264]:
df_updated
Out[264]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 17.850 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 17.850 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 17.850 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 17.850 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 17.850 2.036403

2075 rows × 9 columns

In [265]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day, 7 days, 30 days back), computed per country
# so one country's history never bleeds into another country's lags.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [266]:
# The first day/week/month of each country's series has no lag history;
# treat those missing lag values as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [267]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the 'Mortality Rate' target and its lagged
# copies, so the target leaks into the components used as model inputs below —
# this likely inflates the reported R^2; confirm the intended column range.
# NOTE(review): PCA is fit on unscaled data here (StandardScaler is applied only
# after the split, downstream); the conventional order is standardize first,
# then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[267]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [268]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of retained components (matches the count of model input variables)
# Project the data onto the leading components (columns ordered by explained variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [269]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixtures of
# ALL columns fed to the PCA), not the original features — reusing the original
# feature names here makes the later importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
# Carry the country label alongside the components (positional copy; assumes
# principal_components has one row per df_updated row — holds by construction)
principal_df['location'] = df_updated['location'].values
In [270]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused below (X is built from
# principal_df, y from 'Mortality Rate'), so this step may be dead code —
# verify before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [271]:
# Model inputs are the six retained principal components; the target is the
# unlagged mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; a fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [272]:
# Fit scaling on the training set only, so the test set is later transformed
# with the training statistics (no train/test contamination at this step)
scaler = StandardScaler()
scaler.fit(X_train)
Out[272]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [273]:
# Apply scaling on the training set (z-scores using the fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [274]:
# Apply scaling on the test set using the training-set statistics
X_test_scaled = scaler.transform(X_test)
In [275]:
# Base estimator for the hyperparameter search
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: forest size, tree depth, and split/leaf size constraints
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200],
}
In [276]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): this shuffles daily observations of a time series into random
# folds, so near-duplicate adjacent days land in both train and validation
# splits; TimeSeriesSplit would give a more honest score — confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9986576037239662
In [277]:
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids copying each hyperparameter by hand (which can
# silently drift if the grid gains a parameter); grid_search.best_estimator_
# would also work, since GridSearchCV refits on the full training set by default.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Test-set predictions for the evaluation cell below
y_pred = best_rf_model.predict(X_test_scaled)
In [278]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# of the two vectors after normalizing each to sum to 1 — it is not a standard
# regression metric, and its value here is hard to interpret; confirm it is
# intentional.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0038241030962780373
R2 Score: 0.999504068554037
RMSE: 0.061839
Entropy Value: 0.00011032695266333635
In [279]:
# Rank the model inputs by impurity-based (Gini) importance.
# NOTE(review): these importances belong to the principal components, which
# were only *labelled* with the original feature names above — they do not
# measure the raw features directly.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[279]:
feature importance
1 diabetes_prevalence 0.956911
2 female_smokers 0.023969
0 cardiovasc_death_rate 0.010972
5 aged_65_older 0.005894
3 male_smokers 0.001919
4 life_expectancy 0.000335
In [280]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[280]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [281]:
country1 = 'Netherlands'
country2 = 'Romania'

# Keep only the country-health-index features plus identifiers and the target,
# then restrict the rows to the two countries being compared.
country_cols = [
    'location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [282]:
df_updated
Out[282]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.320 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.320 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.320 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.320 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.320 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403

2075 rows × 9 columns

In [283]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day, 7 days, 30 days back), computed per country
# so one country's history never bleeds into another country's lags.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [284]:
# The first day/week/month of each country's series has no lag history;
# treat those missing lag values as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [285]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the 'Mortality Rate' target and its lagged
# copies, so the target leaks into the components used as model inputs below —
# this likely inflates the reported R^2; confirm the intended column range.
# NOTE(review): PCA is fit on unscaled data here (StandardScaler is applied only
# after the split, downstream); the conventional order is standardize first,
# then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[285]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [286]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of retained components (matches the count of model input variables)
# Project the data onto the leading components (columns ordered by explained variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [287]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixtures of
# ALL columns fed to the PCA), not the original features — reusing the original
# feature names here makes the later importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the components (positional copy; assumes
# principal_components has one row per df_updated row — holds by construction)
principal_df['location'] = df_updated['location'].values
In [288]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused below (X is built from
# principal_df, y from 'Mortality Rate'), so this step may be dead code —
# verify before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [289]:
# Model inputs are the six retained principal components; the target is the
# unlagged mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; a fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [290]:
# Fit scaling on the training set only, so the test set is later transformed
# with the training statistics (no train/test contamination at this step)
scaler = StandardScaler()
scaler.fit(X_train)
Out[290]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [291]:
# Apply scaling on the training set (z-scores using the fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [292]:
# Apply scaling on the test set using the training-set statistics
X_test_scaled = scaler.transform(X_test)
In [293]:
# Base estimator for the hyperparameter search
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space: forest size, tree depth, and split/leaf size constraints
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200],
}
In [294]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): this shuffles daily observations of a time series into random
# folds, so near-duplicate adjacent days land in both train and validation
# splits; TimeSeriesSplit would give a more honest score — confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9986786040355483
In [295]:
# GridSearchCV(refit=True, the default) has already refit a model on the full
# training split with the best hyperparameters and rf's random_state=42, so
# reuse it instead of re-training an identical copy by hand.
best_rf_model = grid_search.best_estimator_

# Hold-out predictions for the evaluation cell that follows.
y_pred = best_rf_model.predict(X_test_scaled)
In [296]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and applying it to raw mortality values is hard
# to interpret; consider dropping it or justifying it in prose.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003894529879047989
R2 Score: 0.9994949352081688
RMSE: 0.062406
Entropy Value: 0.00013511012619462034
In [297]:
# Rank the model's inputs by impurity-based importance (highest first).
# Note the 'feature' labels are PCA components named after original features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[297]:
feature importance
1 human_development_index 0.966297
2 extreme_poverty 0.024977
5 population 0.006146
3 gdp_per_capita 0.002094
4 population_density 0.000341
0 hospital_beds_per_thousand 0.000146
In [298]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[298]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [299]:
# Country pair for this run of the pipeline.
country1 = 'Serbia'
country2 = 'Slovakia'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): df_updated is overwritten in place, so re-running this cell on
# a fresh kernel is required (columns are already dropped); the whole repeated
# pipeline would be better expressed as a function of (country1, country2).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [300]:
# Preview the two-country subset (rich display).
df_updated
Out[300]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.070 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.070 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.070 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.070 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.070 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 37.7 40.2 76.00 17.366 0.717058
16755 Serbia 12/26/2022 439.415 10.08 37.7 40.2 76.00 17.366 0.716963
16756 Serbia 12/27/2022 439.415 10.08 37.7 40.2 76.00 17.366 0.716677
16757 Serbia 12/28/2022 439.415 10.08 37.7 40.2 76.00 17.366 0.716395
16758 Serbia 12/29/2022 439.415 10.08 37.7 40.2 76.00 17.366 0.716205

2067 rows × 9 columns

In [301]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Attach the mortality rate observed 1, 7, and 30 days earlier to every row,
# shifted within each country so series never bleed into one another.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [302]:
# The first rows of each country's series have no lagged observation; treat
# the pre-observation mortality rate as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [303]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# includes 'Mortality Rate' and its three lag columns — the components encode
# the prediction target (leakage) — and PCA is fit on ALL rows before the
# train/test split. Fit PCA on predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[303]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [304]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): the PCA input has 10 columns here (6 predictors +
# 'Mortality Rate' + 3 lag columns), so this keeps the first 6 components of a
# 10-dimensional space — not "the 6 input variables".
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [305]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [306]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X comes from principal_df —
# so the net effect is only to remove 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [307]:
# Input columns for the model — PCA component scores carrying (mislabeled)
# original feature names.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
# NOTE(review): y aligns with X only by row position; both derive from the same
# df_updated ordering so this holds, but an explicit index join would be safer.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): PCA was fit on the full data including 'Mortality Rate' and
# its lags before this split — test scores are inflated by target leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [308]:
# Fit scaling on the training set only (test statistics must not leak into the
# scaler). The bare .fit() on the last line produces the StandardScaler()
# rich-repr output shown below the cell.
scaler = StandardScaler()
scaler.fit(X_train)
Out[308]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [309]:
# Apply scaling on the training set (transform, not fit_transform — the
# statistics were estimated in the previous cell).
X_train_scaled = scaler.transform(X_train)
In [310]:
# Apply scaling on the test set, reusing the training-split statistics so no
# test information leaks into preprocessing.
X_test_scaled = scaler.transform(X_test)
In [311]:
# Base estimator for the grid search below. Only the seed is pinned for
# reproducibility; n_estimators is deliberately NOT set because every value in
# param_grid overrides it (the previous n_estimators=100 argument was dead code).
rf = RandomForestRegressor(random_state=42)

# Hyperparameter grid: 3*3*3*3 = 81 candidate combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [312]:
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the scaled
# training data. n_jobs=-1 parallelises the 81 candidates x 10 folds across all
# CPU cores; it does not change the selected model or its score.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best hyperparameters and the corresponding mean CV score (GridSearchCV's
# default scoring for a regressor is R^2).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9926958790865121
In [313]:
# GridSearchCV(refit=True, the default) has already refit a model on the full
# training split with the best hyperparameters and rf's random_state=42, so
# reuse it instead of re-training an identical copy by hand.
best_rf_model = grid_search.best_estimator_

# Hold-out predictions for the evaluation cell that follows.
y_pred = best_rf_model.predict(X_test_scaled)
In [314]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors into
# probability distributions and returns their KL divergence — not a standard
# regression metric; consider dropping it or justifying it in prose.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0007778872294668714
R2 Score: 0.9968165123804168
RMSE: 0.027891
Entropy Value: 0.000446539171870253
In [315]:
# Rank the model's inputs by impurity-based importance (highest first).
# Note the 'feature' labels are PCA components named after original features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[315]:
feature importance
1 diabetes_prevalence 0.871034
5 aged_65_older 0.063863
2 female_smokers 0.029942
0 cardiovasc_death_rate 0.029801
3 male_smokers 0.003667
4 life_expectancy 0.001694
In [316]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[316]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [317]:
# Country pair for this run of the pipeline.
country1 = 'Serbia'
country2 = 'Slovakia'

# Extracting important features for the Random Forest Model Analysis for the country health index
# NOTE(review): df_updated is overwritten in place (re-running this cell needs
# a fresh read); the repeated pipeline should be a function of the country pair.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [318]:
# Preview the two-country subset (rich display).
df_updated
Out[318]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.717058
16755 Serbia 12/26/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716963
16756 Serbia 12/27/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716677
16757 Serbia 12/28/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716395
16758 Serbia 12/29/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716205

2067 rows × 9 columns

In [319]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Attach the mortality rate observed 1, 7, and 30 days earlier to every row,
# shifted within each country so series never bleed into one another.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [320]:
# The first rows of each country's series have no lagged observation; treat
# the pre-observation mortality rate as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [321]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# includes 'Mortality Rate' and its three lag columns — the components encode
# the prediction target (leakage) — and PCA is fit on ALL rows before the
# train/test split. Fit PCA on predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[321]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [322]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): the PCA input has 10 columns here (6 predictors +
# 'Mortality Rate' + 3 lag columns), so this keeps the first 6 components of a
# 10-dimensional space — not "the 6 input variables".
n_components = 6  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [323]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [324]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X comes from principal_df —
# so the net effect is only to remove 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [325]:
# Input columns for the model — PCA component scores carrying (mislabeled)
# original feature names.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# NOTE(review): y aligns with X only by row position; both derive from the same
# df_updated ordering so this holds, but an explicit index join would be safer.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): PCA was fit on the full data including 'Mortality Rate' and
# its lags before this split — test scores are inflated by target leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [326]:
# Fit scaling on the training set only (test statistics must not leak into the
# scaler). The bare .fit() on the last line produces the StandardScaler()
# rich-repr output shown below the cell.
scaler = StandardScaler()
scaler.fit(X_train)
Out[326]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [327]:
# Apply scaling on the training set (transform, not fit_transform — the
# statistics were estimated in the previous cell).
X_train_scaled = scaler.transform(X_train)
In [328]:
# Apply scaling on the test set, reusing the training-split statistics so no
# test information leaks into preprocessing.
X_test_scaled = scaler.transform(X_test)
In [329]:
# Base estimator for the grid search below. Only the seed is pinned for
# reproducibility; n_estimators is deliberately NOT set because every value in
# param_grid overrides it (the previous n_estimators=100 argument was dead code).
rf = RandomForestRegressor(random_state=42)

# Hyperparameter grid: 3*3*3*3 = 81 candidate combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [330]:
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the scaled
# training data. n_jobs=-1 parallelises the 81 candidates x 10 folds across all
# CPU cores; it does not change the selected model or its score.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best hyperparameters and the corresponding mean CV score (GridSearchCV's
# default scoring for a regressor is R^2).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9928368516844188
In [331]:
# GridSearchCV(refit=True, the default) has already refit a model on the full
# training split with the best hyperparameters and rf's random_state=42, so
# reuse it instead of re-training an identical copy by hand.
best_rf_model = grid_search.best_estimator_

# Hold-out predictions for the evaluation cell that follows.
y_pred = best_rf_model.predict(X_test_scaled)
In [332]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors into
# probability distributions and returns their KL divergence — not a standard
# regression metric; consider dropping it or justifying it in prose.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0006865654195289836
R2 Score: 0.9971902450248445
RMSE: 0.026202
Entropy Value: 0.0004341579931648298
In [333]:
# Rank the model's inputs by impurity-based importance (highest first).
# Note the 'feature' labels are PCA components named after original features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[333]:
feature importance
1 human_development_index 0.877517
5 population 0.080276
2 extreme_poverty 0.031016
0 hospital_beds_per_thousand 0.005715
3 gdp_per_capita 0.003793
4 population_density 0.001682
In [334]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[334]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [335]:
# Country pair for this run of the pipeline.
country1 = 'Sweden'
country2 = 'Switzerland'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): df_updated is overwritten in place (re-running this cell needs
# a fresh read); the repeated pipeline should be a function of the country pair.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [336]:
# Preview the two-country subset (rich display).
df_updated
Out[336]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 22.6 28.9 83.78 18.436 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 22.6 28.9 83.78 18.436 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 22.6 28.9 83.78 18.436 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 22.6 28.9 83.78 18.436 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 22.6 28.9 83.78 18.436 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 19.985 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 19.985 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 19.985 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 19.985 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 19.985 0.816005

2102 rows × 9 columns

In [337]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Attach the mortality rate observed 1, 7, and 30 days earlier to every row,
# shifted within each country so series never bleed into one another.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [338]:
# The first rows of each country's series have no lagged observation; treat
# the pre-observation mortality rate as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [339]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# includes 'Mortality Rate' and its three lag columns — the components encode
# the prediction target (leakage) — and PCA is fit on ALL rows before the
# train/test split. Fit PCA on predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[339]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [340]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): the PCA input has 10 columns here (6 predictors +
# 'Mortality Rate' + 3 lag columns), so this keeps the first 6 components of a
# 10-dimensional space — not "the 6 input variables".
n_components = 6  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [341]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [342]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X comes from principal_df —
# so the net effect is only to remove 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [343]:
# Input columns for the model — PCA component scores carrying (mislabeled)
# original feature names.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
# NOTE(review): y aligns with X only by row position; both derive from the same
# df_updated ordering so this holds, but an explicit index join would be safer.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): PCA was fit on the full data including 'Mortality Rate' and
# its lags before this split — test scores are inflated by target leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [344]:
# Fit scaling on the training set only (test statistics must not leak into the
# scaler). The bare .fit() on the last line produces the StandardScaler()
# rich-repr output shown below the cell.
scaler = StandardScaler()
scaler.fit(X_train)
Out[344]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [345]:
# Apply scaling on the training set (transform, not fit_transform — the
# statistics were estimated in the previous cell).
X_train_scaled = scaler.transform(X_train)
In [346]:
# Apply scaling on the test set, reusing the training-split statistics so no
# test information leaks into preprocessing.
X_test_scaled = scaler.transform(X_test)
In [347]:
# Base estimator for the grid search below. Only the seed is pinned for
# reproducibility; n_estimators is deliberately NOT set because every value in
# param_grid overrides it (the previous n_estimators=100 argument was dead code).
rf = RandomForestRegressor(random_state=42)

# Hyperparameter grid: 3*3*3*3 = 81 candidate combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [348]:
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the scaled
# training data. n_jobs=-1 parallelises the 81 candidates x 10 folds across all
# CPU cores; it does not change the selected model or its score.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best hyperparameters and the corresponding mean CV score (GridSearchCV's
# default scoring for a regressor is R^2).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9966517027412476
In [349]:
# GridSearchCV(refit=True, the default) has already refit a model on the full
# training split with the best hyperparameters and rf's random_state=42, so
# reuse it instead of re-training an identical copy by hand.
best_rf_model = grid_search.best_estimator_

# Hold-out predictions for the evaluation cell that follows.
y_pred = best_rf_model.predict(X_test_scaled)
In [350]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors into
# probability distributions and returns their KL divergence — not a standard
# regression metric; consider dropping it or justifying it in prose.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.024355474033570693
R2 Score: 0.9953469269223459
RMSE: 0.156062
Entropy Value: 0.0010087574199164379
In [351]:
# Rank the model's inputs by impurity-based importance (highest first).
# Note the 'feature' labels are PCA components named after original features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[351]:
feature importance
1 diabetes_prevalence 0.908712
5 aged_65_older 0.037086
0 cardiovasc_death_rate 0.030881
2 female_smokers 0.019542
3 male_smokers 0.002968
4 life_expectancy 0.000811
In [352]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[352]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [353]:
# Country pair for this run of the pipeline.
country1 = 'Sweden'
country2 = 'Switzerland'

# Extracting important features for the Random Forest Model Analysis for the country health index
# NOTE(review): df_updated is overwritten in place (re-running this cell needs
# a fresh read); the repeated pipeline should be a function of the country pair.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [354]:
df_updated
Out[354]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.816005

2102 rows × 9 columns

In [355]:
# Convert the time series into a supervised-learning layout: each row gets the
# mortality rate observed 1 day, 1 week, and ~1 month (30 days) earlier.
# A random forest has no notion of sequence order, so the temporal signal must
# be made explicit as lag columns before the model can use it.
# Hoisted: one groupby instead of three identical ones (loop-invariant work).
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [356]:
# The first day/week/month of each country's series has no lag history; treat
# missing history as a mortality rate of 0 rather than dropping those rows.
# Consolidated: one vectorized fillna over the three lag columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [357]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and its three lag columns — the prediction
# target leaks into the features. Confirm and exclude the target before fitting.
# NOTE(review): PCA is fit on unscaled data (StandardScaler is applied only
# later, after the split), so high-variance columns such as population dominate
# the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[357]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [358]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for Random Forest Model Analysis
# Project onto the fitted components and keep only the first 6. The transform
# input must match the columns pca was fit on (iloc[:, 2:]) — which, per the
# note on the fit cell, includes the target and its lags.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [359]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL
# PCA inputs), not the original features — labelling PC1..PC6 with feature names
# is misleading, and the downstream "feature importances" are really component
# importances. Downstream cells select by these names, so renaming must be
# coordinated across cells.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Attach location positionally (principal_components preserves row order).
principal_df['location'] = df_updated['location'].values
In [360]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never used afterwards — X is
# built from principal_df and y from 'Mortality Rate' — so this encoding has no
# effect on the model and could be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [361]:
# Model inputs: the six retained principal components; target: mortality rate.
# NOTE(review): X comes from principal_df and y from df_updated — alignment
# relies purely on row order being preserved between the two frames.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [362]:
# Standardise features using statistics learned from the training split only
# (fit returns the scaler itself, so fitting can be chained onto construction).
scaler = StandardScaler().fit(X_train)
Out[362]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [363]:
# Apply scaling on the training set (scaler was fit on X_train only)
X_train_scaled = scaler.transform(X_train)
In [364]:
# Apply scaling on the test set using the train-fitted scaler (no test leakage)
X_test_scaled = scaler.transform(X_test)
In [365]:
# Instantiate the RandomForestRegressor Model
# Base estimator for the grid search; its n_estimators=100 is a placeholder —
# the grid below overrides it during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [366]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 81 candidates x 10 folds = 810 fits on one core; consider
# n_jobs=-1. Also, the rows are shuffled time-series observations, so adjacent
# days can fall into different folds — CV scores are likely optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9964944545383589
In [367]:
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids re-listing every key by hand (and silently
# drifting if the grid gains a parameter); behaviour is identical.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [368]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and computes KL divergence — it is not a regression
# error metric, and zeros in y_pred can make it infinite. Consider dropping it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.025985308865571106
R2 Score: 0.9950355496702608
RMSE: 0.161200
Entropy Value: 0.0009864073077925806
In [369]:
# Rank the model inputs by impurity-based importance, highest first.
# (These are importances of principal components, labelled with feature names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[369]:
feature importance
1 human_development_index 0.938681
5 population 0.038091
2 extreme_poverty 0.018932
3 gdp_per_capita 0.003068
4 population_density 0.000901
0 hospital_beds_per_thousand 0.000327
In [370]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[370]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [371]:
country1 = 'Cyprus'
country2 = 'Iceland'

# Restrict to the population-health-index features (plus identifiers and
# target) for the two countries under comparison.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [372]:
df_updated
Out[372]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 0.00000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 0.00000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 0.00000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 0.00000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 0.11011

2063 rows × 9 columns

In [373]:
# Convert the time series into a supervised-learning layout: each row gets the
# mortality rate observed 1 day, 1 week, and ~1 month (30 days) earlier, since
# a random forest cannot exploit sequence order on its own.
# Hoisted: one groupby instead of three identical ones (loop-invariant work).
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [374]:
# The start of each country's series has no lag history; treat missing history
# as a mortality rate of 0. Consolidated into one vectorized fillna.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [375]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and its lag columns — the target leaks into
# the features. Confirm and exclude the target before fitting.
# NOTE(review): PCA is fit on unscaled data; high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[375]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [376]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Project onto the fitted components and keep the first 6; the transform input
# must match the columns pca was fit on (iloc[:, 2:], incl. target + lags).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [377]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — labelling PC1..PC6 with feature names is misleading, and the
# downstream "feature importances" are really component importances.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
# Attach location positionally (principal_components preserves row order).
principal_df['location'] = df_updated['location'].values
In [378]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never used afterwards (X comes
# from principal_df, y from 'Mortality Rate'), so this step has no model effect.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [379]:
# Model inputs: the six retained principal components; target: mortality rate.
# NOTE(review): alignment of X (principal_df) and y (df_updated) relies purely
# on row order being preserved between the two frames.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [380]:
# Standardise features using statistics from the training split only
# (fit returns the scaler, so construction and fitting chain in one line).
scaler = StandardScaler().fit(X_train)
Out[380]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [381]:
# Apply scaling on the training set (scaler was fit on X_train only)
X_train_scaled = scaler.transform(X_train)
In [382]:
# Apply scaling on the test set using the train-fitted scaler (no test leakage)
X_test_scaled = scaler.transform(X_test)
In [383]:
# Instantiate the RandomForestRegressor Model
# Base estimator for the grid search; its n_estimators=100 is a placeholder —
# the grid below overrides it during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [384]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 81 candidates x 10 folds = 810 fits; consider n_jobs=-1.
# Shuffled time-series rows make CV scores likely optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9765435752469307
In [385]:
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids re-listing every key by hand; behaviour is
# identical to spelling each parameter out.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [386]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence of (unnormalised)
# distributions — not a regression error metric; zeros in y_pred can make it
# infinite. Consider dropping it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0006815282511744329
R2 Score: 0.9965253284287292
RMSE: 0.026106
Entropy Value: 0.0005097313778928398
In [387]:
# Rank the model inputs by impurity-based importance, highest first.
# (These are importances of principal components, labelled with feature names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[387]:
feature importance
1 diabetes_prevalence 0.560563
0 cardiovasc_death_rate 0.384111
2 female_smokers 0.029504
5 aged_65_older 0.010627
4 life_expectancy 0.007998
3 male_smokers 0.007196
In [388]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[388]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [389]:
country1 = 'Cyprus'
country2 = 'Iceland'

# Restrict to the country-health-index features (plus identifiers and target)
# for the two countries under comparison.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [390]:
df_updated
Out[390]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2063 rows × 9 columns

In [391]:
# Convert the time series into a supervised-learning layout: each row gets the
# mortality rate observed 1 day, 1 week, and ~1 month (30 days) earlier, since
# a random forest cannot exploit sequence order on its own.
# Hoisted: one groupby instead of three identical ones (loop-invariant work).
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [392]:
# The start of each country's series has no lag history; treat missing history
# as a mortality rate of 0. Consolidated into one vectorized fillna.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [393]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and its lag columns — the target leaks into
# the features. Confirm and exclude the target before fitting.
# NOTE(review): PCA is fit on unscaled data; high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[393]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [394]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # number of input variables for Random Forest Model Analysis
# Project onto the fitted components and keep the first 6; the transform input
# must match the columns pca was fit on (iloc[:, 2:], incl. target + lags).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [395]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — labelling PC1..PC6 with feature names is misleading, and the
# downstream "feature importances" are really component importances.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Attach location positionally (principal_components preserves row order).
principal_df['location'] = df_updated['location'].values
In [396]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never used afterwards (X comes
# from principal_df, y from 'Mortality Rate'), so this step has no model effect.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [397]:
# Model inputs: the six retained principal components; target: mortality rate.
# NOTE(review): alignment of X (principal_df) and y (df_updated) relies purely
# on row order being preserved between the two frames.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [398]:
# Standardise features using statistics from the training split only
# (fit returns the scaler, so construction and fitting chain in one line).
scaler = StandardScaler().fit(X_train)
Out[398]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [399]:
# Apply scaling on the training set (scaler was fit on X_train only)
X_train_scaled = scaler.transform(X_train)
In [400]:
# Apply scaling on the test set using the train-fitted scaler (no test leakage)
X_test_scaled = scaler.transform(X_test)
In [401]:
# Instantiate the RandomForestRegressor Model
# Base estimator for the grid search; its n_estimators=100 is a placeholder —
# the grid below overrides it during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [402]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 81 candidates x 10 folds = 810 fits; consider n_jobs=-1.
# Shuffled time-series rows make CV scores likely optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9766546149550385
In [403]:
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids re-listing every key by hand; behaviour is
# identical to spelling each parameter out.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [404]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence of (unnormalised)
# distributions — not a regression error metric; zeros in y_pred can make it
# infinite. Consider dropping it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0006872215957745136
R2 Score: 0.9964963017484805
RMSE: 0.026215
Entropy Value: 0.0005144780719953724
In [405]:
# Rank the model inputs by impurity-based importance, highest first.
# (These are importances of principal components, labelled with feature names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[405]:
feature importance
1 human_development_index 0.918759
2 extreme_poverty 0.029885
5 population 0.021024
0 hospital_beds_per_thousand 0.013524
3 gdp_per_capita 0.010039
4 population_density 0.006769
In [406]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[406]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [53]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Restrict to the population-health-index features (plus identifiers and
# target) for the two countries under comparison.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [54]:
df_updated
Out[54]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 26.0 82.25 14.312 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 26.0 82.25 14.312 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 26.0 82.25 14.312 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 26.0 82.25 14.312 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 26.0 82.25 14.312 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 25.7 82.30 13.928 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 25.7 82.30 13.928 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 25.7 82.30 13.928 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 25.7 82.30 13.928 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 25.7 82.30 13.928 0.491388

2076 rows × 9 columns

In [55]:
# Convert the time series into a supervised-learning layout: each row gets the
# mortality rate observed 1 day, 1 week, and ~1 month (30 days) earlier, since
# a random forest cannot exploit sequence order on its own.
# Hoisted: one groupby instead of three identical ones (loop-invariant work).
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [56]:
# The start of each country's series has no lag history; treat missing history
# as a mortality rate of 0. Consolidated into one vectorized fillna.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [57]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and its lag columns — the target leaks into
# the features. Confirm and exclude the target before fitting.
# NOTE(review): PCA is fit on unscaled data; high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[57]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [58]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # number of input variables for the Random Forest Model Analysis
# Project onto the fitted components and keep the first 6; the transform input
# must match the columns pca was fit on (iloc[:, 2:], incl. target + lags).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [59]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — labelling PC1..PC6 with feature names is misleading, and the
# downstream "feature importances" are really component importances.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
# Attach location positionally (principal_components preserves row order).
principal_df['location'] = df_updated['location'].values
In [60]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never used afterwards (X comes
# from principal_df, y from 'Mortality Rate'), so this step has no model effect.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [61]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [62]:
# Fit scaling on the training set
# Fitting the scaler on the training split only (and merely transforming the
# test split below) avoids scaling leakage — this part is correct practice.
scaler = StandardScaler()
scaler.fit(X_train)
Out[62]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [63]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [64]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [65]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 here is only a placeholder — the grid below tries
# 50/100/200 and overrides it during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [66]:
# perform grid search and 10-fold cross-validation (k = 10)
# With no scoring argument, GridSearchCV uses the regressor's default score
# (R^2); cv=10 uses plain KFold over the already randomly shuffled rows.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998042071930531
In [67]:
# fit random forest model with best hyperparameters from above
# GridSearchCV already refits a model with the best hyperparameters on the
# full training set when the search finishes (refit=True is the default),
# so manually re-constructing and re-fitting a RandomForestRegressor from
# grid_search.best_params_ duplicated that work. best_estimator_ is that
# already-fitted model (same hyperparameters, same random_state=42, trained
# on the same X_train_scaled / y_train), so predictions are unchanged.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [68]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns the KL divergence D(pk || qk); it
# is not a regression metric, so the printed "Entropy Value" should not be
# read as model uncertainty — consider dropping it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0021993205246510017
R2 Score: 0.9990368656566249
RMSE: 0.046897
Entropy Value: 0.0004146683405752828
In [69]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[69]:
feature importance
0 cardiovasc_death_rate 0.727137
5 aged_65_older 0.225068
2 female_smokers 0.029176
1 diabetes_prevalence 0.016057
3 male_smokers 0.002199
4 life_expectancy 0.000364
In [70]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE: hardcoded absolute Windows path — not portable; prefer a DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[70]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [71]:
# Country pair under analysis for the country-health-index model
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for the Random Forest Model Analysis for the country health index
# Keep 'Mortality Rate' as the last column; rows restricted to the two countries.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [72]:
df_updated
Out[72]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19869 Ireland 12/26/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19870 Ireland 12/27/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19871 Ireland 12/28/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19872 Ireland 12/29/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388

2076 rows × 9 columns

In [73]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): assumes rows are date-sorted within each location — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [74]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): fills "no history" rows with 0, indistinguishable from true zero mortality.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [75]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated's columns here are [location, date, 6 features,
# 'Mortality Rate', 3 lag columns] (see the selection cell above), so
# iloc[:,2:] feeds the target and its lags into PCA — target leakage that
# inflates the downstream R^2. Exclude 'Mortality Rate' and the prev_*
# columns before fitting. PCA is also fit on unscaled features, so
# large-magnitude columns (population) dominate the leading components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[75]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [76]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# First 6 principal components — mixtures of all inputs, not the 6 original variables.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [77]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the original variables
# these labels name; the importance table below inherits this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [78]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [79]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # first 6 principal components
y = df_updated['Mortality Rate'].values  # relies on row alignment between the two frames

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of a time series mixes future/past between splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [80]:
# Fit scaling on the training set
# Scaler is fit on the training split only — correct practice (no scaling leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[80]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [81]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [82]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [83]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 is a placeholder; the grid below overrides it.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [84]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; cv=10 is plain KFold.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998351364576265
In [85]:
# fit random forest model with best hyperparameters from above
# GridSearchCV(refit=True, the default) already refit a model with the best
# hyperparameters on the full training set; re-building a fresh
# RandomForestRegressor from best_params_ duplicated that fit.
# best_estimator_ is the same model (same params, random_state=42, same
# training data), so predictions are unchanged.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [86]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy's entropy(pk, qk) is KL divergence between the two
# arrays normalized into distributions — not a regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001882911265242772
R2 Score: 0.9991754287359407
RMSE: 0.043393
Entropy Value: 0.0004605212242426924
In [87]:
# NOTE(review): these importances belong to the 6 principal components, not
# to the original variables named in selected_cols.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[87]:
feature importance
5 population 0.523426
1 human_development_index 0.437981
2 extreme_poverty 0.036239
3 gdp_per_capita 0.001743
4 population_density 0.000594
0 hospital_beds_per_thousand 0.000017
In [442]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE: hardcoded absolute Windows path — not portable; prefer a DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[442]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [52]:
# Country pair under analysis for the population-health-index model
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [53]:
df_updated
Out[53]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 18.517 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 18.517 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 18.517 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 18.517 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 18.517 22.222222
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 1.084791

2136 rows × 9 columns

In [54]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): assumes rows are date-sorted within each location — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [55]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): fills "no history" rows with 0, indistinguishable from true zero mortality.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [56]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): columns here are [location, date, 6 features, 'Mortality
# Rate', 3 lag columns] (see the selection cell above), so iloc[:,2:] feeds
# the target and its lags into PCA — target leakage that inflates the
# downstream R^2. Exclude 'Mortality Rate' and prev_* before fitting.
# PCA is also fit on unscaled, mixed-magnitude features.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[56]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [57]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6  # of input variables for the Random Forest Model Analysis
# First 6 principal components — mixtures of all inputs, not the 6 original variables.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [58]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the original variables
# these labels name; the importance table below inherits this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [59]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [60]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values  # first 6 principal components
y = df_updated['Mortality Rate'].values  # relies on row alignment between the two frames

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of a time series mixes future/past between splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [61]:
# Fit scaling on the training set
# Scaler is fit on the training split only — correct practice (no scaling leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[61]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [62]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [63]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [64]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 is a placeholder; the grid below overrides it.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [65]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; cv=10 is plain KFold.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9610949677729353
In [66]:
# fit random forest model with best hyperparameters from above
# GridSearchCV(refit=True, the default) already refit a model with the best
# hyperparameters on the full training set; re-building a fresh
# RandomForestRegressor from best_params_ duplicated that fit.
# best_estimator_ is the same model (same params, random_state=42, same
# training data), so predictions are unchanged.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [67]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy's entropy(pk, qk) is KL divergence between the two
# arrays normalized into distributions — not a regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.619699869346872
R2 Score: 0.9744471130217611
RMSE: 0.787210
Entropy Value: 0.006087083561782794
In [68]:
# NOTE(review): these importances belong to the 6 principal components, not
# to the original variables named in selected_cols.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[68]:
feature importance
0 cardiovasc_death_rate 0.778084
1 diabetes_prevalence 0.091493
5 aged_65_older 0.053859
2 female_smokers 0.032591
3 male_smokers 0.028179
4 life_expectancy 0.015794
In [69]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE: hardcoded absolute Windows path — not portable; prefer a DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[69]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [70]:
# Country pair under analysis for the country-health-index model
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [71]:
df_updated
Out[71]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2136 rows × 9 columns

In [72]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): assumes rows are date-sorted within each location — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [73]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): fills "no history" rows with 0, indistinguishable from true zero mortality.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [74]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): columns here are [location, date, 6 features, 'Mortality
# Rate', 3 lag columns] (see the selection cell above), so iloc[:,2:] feeds
# the target and its lags into PCA — target leakage that inflates the
# downstream R^2. Exclude 'Mortality Rate' and prev_* before fitting.
# PCA is also fit on unscaled, mixed-magnitude features.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[74]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [75]:
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6  # of input variables for Random Forest Model Analysis
# First 6 principal components — mixtures of all inputs, not the 6 original variables.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [76]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the original variables
# these labels name; the importance table below inherits this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [77]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [78]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # first 6 principal components
y = df_updated['Mortality Rate'].values  # relies on row alignment between the two frames

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of a time series mixes future/past between splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [79]:
# Fit scaling on the training set
# Scaler is fit on the training split only — correct practice (no scaling leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[79]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [80]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [81]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [82]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 is a placeholder; the grid below overrides it.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [83]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; cv=10 is plain KFold.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9638248976625985
In [84]:
# fit random forest model with best hyperparameters from above
# GridSearchCV(refit=True, the default) already refit a model with the best
# hyperparameters on the full training set; re-building a fresh
# RandomForestRegressor from best_params_ duplicated that fit.
# best_estimator_ is the same model (same params, random_state=42, same
# training data), so predictions are unchanged.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [85]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy's entropy(pk, qk) is KL divergence between the two
# arrays normalized into distributions — not a regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.908396956844949
R2 Score: 0.9625428922647679
RMSE: 0.953099
Entropy Value: 0.007020342279931816
In [86]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[86]:
feature importance
1 human_development_index 0.840735
2 extreme_poverty 0.059561
5 population 0.058444
3 gdp_per_capita 0.032403
4 population_density 0.008650
0 hospital_beds_per_thousand 0.000206
In [13]:
# Country Pair by Pair Analysis relative to population density
In [14]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE: hardcoded absolute Windows path — not portable; prefer a DATA_DIR constant.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[14]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [15]:
# Showing the pairings of countries based on population density (13 pairs of countries)

def _country_rows(frame, name):
    """Return the rows of `frame` whose `location` column equals `name`."""
    return frame[frame["location"] == name]

df_Bulgaria = _country_rows(df, "Bulgaria")
df_Canada = _country_rows(df, "Canada")

df_Estonia = _country_rows(df, "Estonia")
df_Finland = _country_rows(df, "Finland")

df_Iceland = _country_rows(df, "Iceland")
df_Ireland = _country_rows(df, "Ireland")

df_Latvia = _country_rows(df, "Latvia")
df_Romania = _country_rows(df, "Romania")

df_Serbia = _country_rows(df, "Serbia")
df_Spain = _country_rows(df, "Spain")

df_Sweden = _country_rows(df, "Sweden")
df_UnitedStates = _country_rows(df, "United States")

df_Austria = _country_rows(df, "Austria")
df_Cyprus = _country_rows(df, "Cyprus")

df_Czechia = _country_rows(df, "Czechia")
df_Denmark = _country_rows(df, "Denmark")

df_France = _country_rows(df, "France")
df_Portugal = _country_rows(df, "Portugal")

df_Slovakia = _country_rows(df, "Slovakia")
df_Slovenia = _country_rows(df, "Slovenia")

df_Belgium = _country_rows(df, "Belgium")
df_Italy = _country_rows(df, "Italy")

df_Luxembourg = _country_rows(df, "Luxembourg")
df_Netherlands = _country_rows(df, "Netherlands")

df_Switzerland = _country_rows(df, "Switzerland")
df_UnitedKingdom = _country_rows(df, "United Kingdom")
In [16]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [17]:
# Concatenate the per-country dataframes — both members of every pair defined
# above (26 countries in total) — into a single dataframe.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# NOTE(review): this writes to the current working directory, while later
# cells read the file from C:/Users/marco/Downloads — the file must be moved
# by hand for a clean re-run. Consider to_csv(..., index=False) so the row
# index is not persisted as an extra unnamed column.
dataframe_one.to_csv("dataframe-one.csv")
In [18]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path — prefer a relative,
# configurable path for reproducibility.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[18]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [19]:
# Current country pair (matched on population density).
country1 = 'Bulgaria'
country2 = 'Canada'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the current country pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [20]:
df_updated
Out[20]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2099 rows × 10 columns

In [21]:
# Convert the time series to a supervised-learning layout for the Random
# Forest by adding lagged mortality features. shift() is applied within each
# country's group so values never leak across country boundaries: previous
# day (1), previous week (7), and previous month (30).
lag_periods = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(periods)
In [22]:
# Lags are undefined at each country's start; treat that missing history as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [23]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only location/date, so the PCA input still
# contains 'Mortality Rate' (the target) plus its lag columns — the target
# leaks into the components. PCA is also fit on the full, unscaled dataset
# before the train/test split, so components are dominated by large-scale
# variables and informed by test rows. Confirm whether this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[23]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [24]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Project the data and keep only the first n_components columns (components
# are ordered by decreasing explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [26]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names re-use the raw feature names, but each
# column is a principal component (a linear mixture of all inputs), not the
# named variable — downstream importance tables inherit this misleading
# labelling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [27]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not used by the
# model below (X is built from principal_df); this cell effectively only
# drops the string 'location' column before y is extracted.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [28]:
# Principal-component columns used as model inputs (labels inherited from
# principal_df; they name components, not raw variables).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this is a random split of time-series rows; a chronological
# holdout would avoid look-ahead through the lagged features — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [29]:
# Fit scaling on the training set
# (fit only on the training rows so test-set statistics do not leak into
# the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[29]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [30]:
# Apply scaling on the training set
# (transform() uses the mean/std learned above; it does not re-fit)
X_train_scaled = scaler.transform(X_train)
In [31]:
# Apply scaling on the test set
# (same training-set statistics are reused — no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [32]:
# Base RandomForestRegressor; its hyperparameters are tuned by the grid
# search in the next cell.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [33]:
# Exhaustive grid search over param_grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.971114273144166
In [34]:
# Refit a Random Forest using the best hyperparameters found by the grid search.
# Expanding grid_search.best_params_ directly avoids re-typing each key by hand
# and stays correct if the search grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [35]:
# Test-set evaluation: MSE, RMSE, R^2, and the entropy of (y_test, y_pred).
# (scipy's entropy is a KL-divergence between normalized distributions, kept
# here for parity with the other model evaluations in this notebook.)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005813044515929637
R2 Score: 0.9981006045680973
RMSE: 0.076243
Entropy Value: 0.00055910741258311
In [36]:
# Tabulate the (principal-component) inputs by importance, highest first;
# the bare last expression renders the table.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[36]:
feature importance
1 diabetes_prevalence 0.566301
0 cardiovasc_death_rate 0.279674
5 aged_65_older 0.084089
2 female_smokers 0.032316
6 median_age 0.020659
4 life_expectancy 0.010257
3 male_smokers 0.006704
In [37]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path — prefer a relative,
# configurable path for reproducibility.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[37]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [38]:
# Current country pair (matched on population density).
country1 = 'Bulgaria'
country2 = 'Canada'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the current country pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [39]:
df_updated
Out[39]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 6781955 14.285714
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.500 0.929 0.5 44017.591 38454328 1.092509
15717 Canada 12/26/2022 2.500 0.929 0.5 44017.591 38454328 1.092338
15718 Canada 12/27/2022 2.500 0.929 0.5 44017.591 38454328 1.092196
15719 Canada 12/28/2022 2.500 0.929 0.5 44017.591 38454328 1.092321
15720 Canada 12/29/2022 2.500 0.929 0.5 44017.591 38454328 1.093162

2099 rows × 8 columns

In [40]:
# Add lagged mortality features (previous day / week / month) so the time
# series can be treated as a supervised-learning table for the Random Forest.
# shift() is applied per country group, so no value crosses a country
# boundary.
lag_periods = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(periods)
In [41]:
# Lags are undefined at each country's start; treat that missing history as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [42]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only location/date, so the PCA input still
# contains 'Mortality Rate' (the target) plus its lag columns — the target
# leaks into the components. PCA is also fit on the full, unscaled dataset
# before the train/test split. Confirm whether this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[42]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [43]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Project the data and keep only the first n_components columns (components
# are ordered by decreasing explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [44]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names re-use the raw feature names, but each
# column is a principal component (a linear mixture of all inputs), not the
# named variable — downstream importance tables inherit this misleading
# labelling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [45]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not used by the
# model below (X is built from principal_df); this cell effectively only
# drops the string 'location' column before y is extracted.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [46]:
# Principal-component columns used as model inputs (labels inherited from
# principal_df; they name components, not raw variables).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this is a random split of time-series rows; a chronological
# holdout would avoid look-ahead through the lagged features — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [47]:
# Fit scaling on the training set
# (fit only on the training rows so test-set statistics do not leak into
# the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[47]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [48]:
# Apply scaling on the training set
# (transform() uses the mean/std learned above; it does not re-fit)
X_train_scaled = scaler.transform(X_train)
In [49]:
# Apply scaling on the test set
# (same training-set statistics are reused — no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [50]:
# Base RandomForestRegressor; its hyperparameters are tuned by the grid
# search in the next cell.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [51]:
# Exhaustive grid search over param_grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9703778977447899
In [52]:
# Refit a Random Forest using the best hyperparameters found by the grid search.
# Expanding grid_search.best_params_ directly avoids re-typing each key by hand
# and stays correct if the search grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [53]:
# Test-set evaluation: MSE, RMSE, R^2, and the entropy of (y_test, y_pred).
# (scipy's entropy is a KL-divergence between normalized distributions, kept
# here for parity with the other model evaluations in this notebook.)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00667871709917659
R2 Score: 0.9978177485628428
RMSE: 0.081723
Entropy Value: 0.0005102562409853038
In [54]:
# Tabulate the (principal-component) inputs by importance, highest first;
# the bare last expression renders the table.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[54]:
feature importance
1 human_development_index 0.877965
0 hospital_beds_per_thousand 0.051438
2 extreme_poverty 0.045893
4 population 0.013181
3 gdp_per_capita 0.011522
In [55]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path — prefer a relative,
# configurable path for reproducibility.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[55]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [56]:
# Current country pair (matched on population density).
country1 = 'Estonia'
country2 = 'Finland'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the current country pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [57]:
df_updated
Out[57]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
... ... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8372 Finland 12/26/2022 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8373 Finland 12/27/2022 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8374 Finland 12/28/2022 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8375 Finland 12/29/2022 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.55159

2127 rows × 10 columns

In [58]:
# Add lagged mortality features (previous day / week / month) so the time
# series can be treated as a supervised-learning table for the Random Forest.
# shift() is applied per country group, so no value crosses a country
# boundary.
lag_periods = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(periods)
In [59]:
# Lags are undefined at each country's start; treat that missing history as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [60]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only location/date, so the PCA input still
# contains 'Mortality Rate' (the target) plus its lag columns — the target
# leaks into the components. PCA is also fit on the full, unscaled dataset
# before the train/test split. Confirm whether this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[60]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [61]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Project the data and keep only the first n_components columns (components
# are ordered by decreasing explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [62]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names re-use the raw feature names, but each
# column is a principal component (a linear mixture of all inputs), not the
# named variable — downstream importance tables inherit this misleading
# labelling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [63]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not used by the
# model below (X is built from principal_df); this cell effectively only
# drops the string 'location' column before y is extracted.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [64]:
# Principal-component columns used as model inputs (labels inherited from
# principal_df; they name components, not raw variables).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this is a random split of time-series rows; a chronological
# holdout would avoid look-ahead through the lagged features — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [65]:
# Fit scaling on the training set
# (fit only on the training rows so test-set statistics do not leak into
# the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[65]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [66]:
# Apply scaling on the training set
# (transform() uses the mean/std learned above; it does not re-fit)
X_train_scaled = scaler.transform(X_train)
In [67]:
# Apply scaling on the test set
# (same training-set statistics are reused — no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [68]:
# Base RandomForestRegressor; its hyperparameters are tuned by the grid
# search in the next cell.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [69]:
# Exhaustive grid search over param_grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9960837123353512
In [70]:
# Refit a Random Forest using the best hyperparameters found by the grid search.
# Expanding grid_search.best_params_ directly avoids re-typing each key by hand
# and stays correct if the search grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [71]:
# Test-set evaluation: MSE, RMSE, R^2, and the entropy of (y_test, y_pred).
# (scipy's entropy is a KL-divergence between normalized distributions, kept
# here for parity with the other model evaluations in this notebook.)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0032302526563769266
R2 Score: 0.9971191211530697
RMSE: 0.056835
Entropy Value: 0.0010010015339330632
In [72]:
# Tabulate the (principal-component) inputs by importance, highest first;
# the bare last expression renders the table.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[72]:
feature importance
1 diabetes_prevalence 0.916992
0 cardiovasc_death_rate 0.044299
2 female_smokers 0.018955
5 aged_65_older 0.012105
6 median_age 0.005019
3 male_smokers 0.001906
4 life_expectancy 0.000725
In [73]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# TODO(review): hardcoded absolute Windows path — prefer a relative,
# configurable path for reproducibility.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[73]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [74]:
country1 = 'Estonia'
country2 = 'Finland'

# Keep only the columns used by the country-health-index Random Forest analysis,
# restricted to the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [75]:
df_updated
Out[75]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.50 29481.252 1326064 0.00000
6250 Estonia 1/18/2020 4.69 0.892 0.50 29481.252 1326064 0.00000
6251 Estonia 2/5/2020 4.69 0.892 0.50 29481.252 1326064 0.00000
6252 Estonia 2/6/2020 4.69 0.892 0.50 29481.252 1326064 0.00000
6253 Estonia 2/7/2020 4.69 0.892 0.50 29481.252 1326064 0.00000
... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 5540745 0.55159

2127 rows × 8 columns

In [76]:
'''
Create lagged mortality-rate features (previous day / week / month) using pandas
shift(). Lagging converts the OWID COVID-19 time series into a tabular supervised
learning problem, which a Random Forest (a non-sequential ensemble model) needs in
order to assess which variables best predict COVID-19 mortality per country.
'''
# Work on an explicit copy: df_updated is a filtered slice of another frame, and
# assigning new columns to it directly triggers pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()

# Lags of 1 day, 7 days (week), and 30 days (month), computed within each country
# so one country's history never bleeds into another's.
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
In [77]:
# The first day/week/month of each country's series has no lagged value; treat
# that missing history as a mortality rate of 0. A single vectorized fillna over
# all three lag columns replaces the triplicated per-column calls.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [78]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged copies, so the
# target variable leaks into the components that feed the model below — confirm
# this is intended.
# NOTE(review): PCA is fit on unscaled data and on the full dataset (before the
# train/test split); standardizing first and fitting on the training split only is
# the conventional order.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[78]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [79]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # number of input variables for the Random Forest Model Analysis
# Project the data and keep only the first n_components columns (components are
# ordered by explained variance, highest first).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [80]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components (linear mixtures of ALL
# PCA inputs, including the lagged mortality columns), not the original features —
# reusing the raw feature names makes the later importance table misleading;
# names like 'PC1'..'PC5' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [81]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df), so this step has no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [82]:
# Model inputs: the retained principal components; target: the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the observations for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [83]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean and standard deviation from the training
# split only, so no test-set information leaks into the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[83]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [84]:
# Apply scaling on the training set (z-score each feature using the training-set
# mean/std learned above)
X_train_scaled = scaler.transform(X_train)
In [85]:
# Apply scaling on the test set using the training-set statistics (never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [86]:
# Instantiate the RandomForestRegressor Model
# NOTE: the n_estimators=100 here is only a placeholder — GridSearchCV clones this
# estimator and overrides every parameter listed in param_grid.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3 * 3 * 3 * 3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [87]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidates x 10 folds on all CPU cores; results are
# unchanged because each candidate forest keeps random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (the default scorer for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9951415575046922
In [88]:
# Refit a Random Forest on the full training set with the best hyperparameters.
# Unpacking best_params_ avoids copying each key by hand and stays in sync with
# param_grid if the grid ever changes.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [89]:
# Evaluate the Random Forest model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and a distribution-divergence measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both inputs to distributions and returns
# the Kullback-Leibler divergence D(pk || qk), NOT Shannon entropy, so this value
# measures how far the predicted distribution is from the actual one.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("KL Divergence (actual vs. predicted):", entropy_val)
MSE:  0.004361670114229664
R2 Score: 0.9961100741935568
RMSE: 0.066043
Entropy Value: 0.0013594739970544801
In [90]:
# Rank the model inputs by impurity-based feature importance, highest first.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[90]:
feature importance
1 human_development_index 0.965318
2 extreme_poverty 0.022507
0 hospital_beds_per_thousand 0.006656
3 gdp_per_capita 0.003913
4 population 0.001606
In [91]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable data
# directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[91]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [92]:
country1 = 'Iceland'
country2 = 'Ireland'

# Keep only the columns used by the population-health-index Random Forest analysis,
# restricted to the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [93]:
df_updated
Out[93]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.00000
18839 Ireland 3/1/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.00000
18840 Ireland 3/2/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.00000
18841 Ireland 3/3/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.00000
18842 Ireland 3/4/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.00000
... ... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2071 rows × 10 columns

In [94]:
'''
Create lagged mortality-rate features (previous day / week / month) using pandas
shift(). Lagging converts the OWID COVID-19 time series into a tabular supervised
learning problem, which a Random Forest (a non-sequential ensemble model) needs in
order to assess which variables best predict COVID-19 mortality per country.
'''
# Work on an explicit copy: df_updated is a filtered slice of another frame, and
# assigning new columns to it directly triggers pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()

# Lags of 1 day, 7 days (week), and 30 days (month), computed within each country
# so one country's history never bleeds into another's.
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
In [95]:
# The first day/week/month of each country's series has no lagged value; treat
# that missing history as a mortality rate of 0. A single vectorized fillna over
# all three lag columns replaces the triplicated per-column calls.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [96]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged copies, so the
# target variable leaks into the components that feed the model below — confirm
# this is intended.
# NOTE(review): PCA is fit on unscaled data and on the full dataset (before the
# train/test split); standardizing first and fitting on the training split only is
# the conventional order.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[96]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [97]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # number of input variables for the Random Forest Model Analysis
# Project the data and keep only the first n_components columns (components are
# ordered by explained variance, highest first).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [98]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components (linear mixtures of ALL
# PCA inputs, including the lagged mortality columns), not the original features —
# reusing the raw feature names makes the later importance table misleading;
# names like 'PC1'..'PC7' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [99]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df), so this step has no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [100]:
# Model inputs: the retained principal components; target: the mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the observations for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [101]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean and standard deviation from the training
# split only, so no test-set information leaks into the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[101]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [102]:
# Apply scaling on the training set (z-score each feature using the training-set
# mean/std learned above)
X_train_scaled = scaler.transform(X_train)
In [103]:
# Apply scaling on the test set using the training-set statistics (never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [104]:
# Instantiate the RandomForestRegressor Model
# NOTE: the n_estimators=100 here is only a placeholder — GridSearchCV clones this
# estimator and overrides every parameter listed in param_grid.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3 * 3 * 3 * 3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [105]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidates x 10 folds on all CPU cores; results are
# unchanged because each candidate forest keeps random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (the default scorer for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.998195637112943
In [106]:
# Refit a Random Forest on the full training set with the best hyperparameters.
# Unpacking best_params_ avoids copying each key by hand and stays in sync with
# param_grid if the grid ever changes.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [107]:
# Evaluate the Random Forest model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and a distribution-divergence measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both inputs to distributions and returns
# the Kullback-Leibler divergence D(pk || qk), NOT Shannon entropy, so this value
# measures how far the predicted distribution is from the actual one.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("KL Divergence (actual vs. predicted):", entropy_val)
MSE:  0.0024494180666809937
R2 Score: 0.9990910741405081
RMSE: 0.049492
Entropy Value: 0.0005680141052599645
In [108]:
# Rank the model inputs by impurity-based feature importance, highest first.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[108]:
feature importance
1 diabetes_prevalence 0.519423
0 cardiovasc_death_rate 0.452298
2 female_smokers 0.018856
5 aged_65_older 0.005584
6 median_age 0.001787
3 male_smokers 0.001295
4 life_expectancy 0.000757
In [109]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable data
# directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[109]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [110]:
country1 = 'Iceland'
country2 = 'Ireland'

# Keep only the columns used by the country-health-index Random Forest analysis,
# restricted to the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [111]:
df_updated
Out[111]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 0.2 67335.293 5023108 0.00000
18839 Ireland 3/1/2020 2.96 0.955 0.2 67335.293 5023108 0.00000
18840 Ireland 3/2/2020 2.96 0.955 0.2 67335.293 5023108 0.00000
18841 Ireland 3/3/2020 2.96 0.955 0.2 67335.293 5023108 0.00000
18842 Ireland 3/4/2020 2.96 0.955 0.2 67335.293 5023108 0.00000
... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.2 46482.958 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.2 46482.958 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.2 46482.958 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.2 46482.958 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.2 46482.958 372903 0.11011

2071 rows × 8 columns

In [112]:
'''
Create lagged mortality-rate features (previous day / week / month) using pandas
shift(). Lagging converts the OWID COVID-19 time series into a tabular supervised
learning problem, which a Random Forest (a non-sequential ensemble model) needs in
order to assess which variables best predict COVID-19 mortality per country.
'''
# Work on an explicit copy: df_updated is a filtered slice of another frame, and
# assigning new columns to it directly triggers pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()

# Lags of 1 day, 7 days (week), and 30 days (month), computed within each country
# so one country's history never bleeds into another's.
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
In [113]:
# The first day/week/month of each country's series has no lagged value; treat
# that missing history as a mortality rate of 0. A single vectorized fillna over
# all three lag columns replaces the triplicated per-column calls.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [114]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged copies, so the
# target variable leaks into the components that feed the model below — confirm
# this is intended.
# NOTE(review): PCA is fit on unscaled data and on the full dataset (before the
# train/test split); standardizing first and fitting on the training split only is
# the conventional order.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[114]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [115]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # number of input variables for the Random Forest Model Analysis
# Project the data and keep only the first n_components columns (components are
# ordered by explained variance, highest first).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [116]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components (linear mixtures of ALL
# PCA inputs, including the lagged mortality columns), not the original features —
# reusing the raw feature names makes the later importance table misleading;
# names like 'PC1'..'PC5' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [117]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df), so this step has no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [118]:
# Model inputs: the retained principal components; target: the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the observations for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [119]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean and standard deviation from the training
# split only, so no test-set information leaks into the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[119]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [120]:
# Apply scaling on the training set (z-score each feature using the training-set
# mean/std learned above)
X_train_scaled = scaler.transform(X_train)
In [121]:
# Apply scaling on the test set using the training-set statistics (never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [122]:
# Instantiate the RandomForestRegressor Model
# NOTE: the n_estimators=100 here is only a placeholder — GridSearchCV clones this
# estimator and overrides every parameter listed in param_grid.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3 * 3 * 3 * 3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [123]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidates x 10 folds on all CPU cores; results are
# unchanged because each candidate forest keeps random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (the default scorer for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9980815151576168
In [124]:
# Refit a Random Forest on the full training set with the best hyperparameters.
# Unpacking best_params_ avoids copying each key by hand and stays in sync with
# param_grid if the grid ever changes.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [125]:
# Evaluate the Random Forest model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and a distribution-divergence measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both inputs to distributions and returns
# the Kullback-Leibler divergence D(pk || qk), NOT Shannon entropy, so this value
# measures how far the predicted distribution is from the actual one.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("KL Divergence (actual vs. predicted):", entropy_val)
MSE:  0.002755488974254762
R2 Score: 0.9989774978725299
RMSE: 0.052493
Entropy Value: 0.0008580065109373093
In [126]:
# Rank the model inputs by impurity-based feature importance, highest first.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[126]:
feature importance
1 human_development_index 0.927514
0 hospital_beds_per_thousand 0.047998
2 extreme_poverty 0.021699
3 gdp_per_capita 0.001706
4 population 0.001083
In [127]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable data
# directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[127]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [128]:
country1 = 'Latvia'
country2 = 'Romania'

# Keep only the columns used by the population-health-index Random Forest analysis,
# restricted to the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [129]:
df_updated
Out[129]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
17800 Romania 2/26/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17801 Romania 2/27/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17802 Romania 2/28/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17803 Romania 2/29/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17804 Romania 3/1/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2076 rows × 10 columns

In [130]:
'''
Create lagged mortality-rate features (previous day / week / month) using pandas
shift(). Lagging converts the OWID COVID-19 time series into a tabular supervised
learning problem, which a Random Forest (a non-sequential ensemble model) needs in
order to assess which variables best predict COVID-19 mortality per country.
'''
# Work on an explicit copy: df_updated is a filtered slice of another frame, and
# assigning new columns to it directly triggers pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()

# Lags of 1 day, 7 days (week), and 30 days (month), computed within each country
# so one country's history never bleeds into another's.
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(periods)
In [131]:
# The first day/week/month of each country's series has no lagged value; treat
# that missing history as a mortality rate of 0. A single vectorized fillna over
# all three lag columns replaces the triplicated per-column calls.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [132]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only the first two columns ('location' and
# 'date'), so the PCA input still contains 'Mortality Rate' -- the prediction
# target -- plus its lagged copies. That is target leakage into the features;
# confirm whether this is intended.
# NOTE(review): PCA is fit on unscaled data and on ALL rows before the
# train/test split; conventional practice is to standardise first and fit the
# PCA on the training rows only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[132]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [133]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [134]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first 7 principal components -- linear
# mixes of ALL PCA inputs (including the mortality lags) -- not the original
# variables. Labelling them with the raw feature names is misleading; names
# like PC1..PC7 would be clearer for the importance table later.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [135]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are never used afterwards (X below
# is built from principal_df), so this step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [136]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split of daily time-series rows puts adjacent
# days in both train and test; combined with the lagged-mortality inputs to
# PCA this inflates test scores -- a chronological split would be more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [137]:
# Fit scaling on the training set
# NOTE(review): standardisation is applied to the PCA outputs here; scaling is
# normally done BEFORE PCA so that large-magnitude raw columns do not dominate
# the components. Fitting on X_train only (not X_test) is correct.
scaler = StandardScaler()
scaler.fit(X_train)
Out[137]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [138]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [139]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [140]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder; GridSearchCV below tunes it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 combinations; with cv=10 that is 810 forest fits
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [141]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scoring for a regressor is R^2. A near-perfect CV
# score here is consistent with the mortality target (and its lags) feeding
# into the PCA features, so interpret it cautiously.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9968109992321585
In [142]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [143]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalises both vectors to
# sum to 1 and returns the KL divergence between them -- it is not a standard
# regression metric and is ill-defined where y_pred is 0 but y_test is not.
# Confirm this is the intended diagnostic.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007447771121020153
R2 Score: 0.9949271611600027
RMSE: 0.086300
Entropy Value: 0.0005423752242841923
In [144]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[144]:
feature importance
0 cardiovasc_death_rate 0.849425
6 median_age 0.056326
1 diabetes_prevalence 0.052684
5 aged_65_older 0.019715
2 female_smokers 0.015182
3 male_smokers 0.006200
4 life_expectancy 0.000468
In [145]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[145]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [146]:
country1 = 'Latvia'
country2 = 'Romania'

# Extracting important features for the Random Forest Model Analysis for the country health index
# Keep the socio-economic ('country health index') columns plus the target,
# then restrict the rows to the two countries being compared.
# NOTE(review): the name df_updated is reused/overwritten for every pairing;
# a per-pair name (or a parameterised function) would make re-runs safer.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [147]:
df_updated
Out[147]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
17800 Romania 2/26/2020 6.892 0.828 5.7 23313.199 19659270 0.000000
17801 Romania 2/27/2020 6.892 0.828 5.7 23313.199 19659270 0.000000
17802 Romania 2/28/2020 6.892 0.828 5.7 23313.199 19659270 0.000000
17803 Romania 2/29/2020 6.892 0.828 5.7 23313.199 19659270 0.000000
17804 Romania 3/1/2020 6.892 0.828 5.7 23313.199 19659270 0.000000
... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.570 0.866 0.7 25063.846 1850654 0.631631
20907 Latvia 12/26/2022 5.570 0.866 0.7 25063.846 1850654 0.631631
20908 Latvia 12/27/2022 5.570 0.866 0.7 25063.846 1850654 0.631485
20909 Latvia 12/28/2022 5.570 0.866 0.7 25063.846 1850654 0.631485
20910 Latvia 12/29/2022 5.570 0.866 0.7 25063.846 1850654 0.631969

2076 rows × 8 columns

In [148]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): this lag / fillna / PCA / grid-search sequence is copy-pasted
# for every country pairing in the notebook -- factoring it into one reusable
# function would remove the duplication.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [149]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates a 0-mortality history for the first
# 1/7/30 rows of each country; dropping those rows may be safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [150]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the target) and its lags -- target leakage;
# confirm this is intended. PCA is also fit unscaled and before the split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[150]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [151]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [152]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first 5 principal components (mixes of
# ALL PCA inputs, including the mortality lags), not the original variables --
# labelling them with raw feature names is misleading; PC1..PC5 would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [153]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): these dummy columns are never used downstream (X is built from
# principal_df); this step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [154]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of daily time-series rows puts adjacent days in
# both sets; a chronological split would give a more honest test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [155]:
# Fit scaling on the training set
# NOTE(review): scaling is applied to the PCA outputs; it is normally done
# BEFORE PCA so large-magnitude columns (e.g. population) do not dominate.
scaler = StandardScaler()
scaler.fit(X_train)
Out[155]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [156]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [157]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [158]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder; GridSearchCV below tunes it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 combinations; with cv=10 that is 810 forest fits
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [159]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default regressor scoring is R^2; the near-perfect CV score
# below is consistent with the mortality target feeding the PCA features.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9965359334331628
In [160]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [161]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence of
# the normalised vectors, not a standard regression metric; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0018681129196820036
R2 Score: 0.9987275876738858
RMSE: 0.043222
Entropy Value: 0.00010830270783841744
In [162]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[162]:
feature importance
1 human_development_index 0.766405
0 hospital_beds_per_thousand 0.193593
2 extreme_poverty 0.023563
3 gdp_per_capita 0.015512
4 population 0.000927
In [163]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[163]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [164]:
country1 = 'Serbia'
country2 = 'Spain'

# Extracting important features for Random Forest Model Analysis for the population health index
# Keep the population-health columns plus the target, then restrict the rows
# to the two countries being compared.
# NOTE(review): df_updated is overwritten for each pairing; a per-pair name or
# a parameterised function would make re-runs safer.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [165]:
df_updated
Out[165]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148

2101 rows × 10 columns

In [166]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): this lag / fillna / PCA / grid-search sequence is copy-pasted
# for every country pairing -- consider one reusable function instead.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [167]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates a 0-mortality history for the first
# 1/7/30 rows of each country; dropping those rows may be safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [168]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the target) and its lags -- target leakage;
# confirm this is intended. PCA is also fit unscaled and before the split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[168]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [169]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [170]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first 7 principal components (mixes of
# ALL PCA inputs, including the mortality lags), not the original variables --
# labelling them with raw feature names is misleading; PC1..PC7 would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [171]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): these dummy columns are never used downstream (X is built from
# principal_df); this step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [172]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of daily time-series rows puts adjacent days in
# both sets; a chronological split would give a more honest test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [173]:
# Fit scaling on the training set
# NOTE(review): scaling is applied to the PCA outputs; it is normally done
# BEFORE PCA so large-magnitude raw columns do not dominate the components.
scaler = StandardScaler()
scaler.fit(X_train)
Out[173]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [174]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [175]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [176]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder; GridSearchCV below tunes it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 combinations; with cv=10 that is 810 forest fits
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [177]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default regressor scoring is R^2; the near-perfect CV score
# below is consistent with the mortality target feeding the PCA features.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9981493153041182
In [178]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [179]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence of
# the normalised vectors, not a standard regression metric; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006970410542749719
R2 Score: 0.9988825508533481
RMSE: 0.083489
Entropy Value: 0.00039008698511220404
In [180]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[180]:
feature importance
1 diabetes_prevalence 0.760484
0 cardiovasc_death_rate 0.154926
5 aged_65_older 0.040161
6 median_age 0.027412
2 female_smokers 0.015343
3 male_smokers 0.001440
4 life_expectancy 0.000235
In [181]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[181]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [182]:
country1 = 'Serbia'
country2 = 'Spain'

# Extracting important features for the Random Forest Model Analysis for the country health index
# Keep the socio-economic ('country health index') columns plus the target,
# then restrict the rows to the two countries being compared.
# NOTE(review): df_updated is overwritten for each pairing; a per-pair name or
# a parameterised function would make re-runs safer.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [183]:
df_updated
Out[183]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.806 0.05 14048.881 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.806 0.05 14048.881 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.806 0.05 14048.881 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.806 0.05 14048.881 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.806 0.05 14048.881 6871547 0.000000
... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.970 0.904 1.00 34272.360 47558632 0.855148
25133 Spain 12/26/2022 2.970 0.904 1.00 34272.360 47558632 0.855148
25134 Spain 12/27/2022 2.970 0.904 1.00 34272.360 47558632 0.855148
25135 Spain 12/28/2022 2.970 0.904 1.00 34272.360 47558632 0.855148
25136 Spain 12/29/2022 2.970 0.904 1.00 34272.360 47558632 0.855148

2101 rows × 8 columns

In [184]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): this lag / fillna / PCA / grid-search sequence is copy-pasted
# for every country pairing -- consider one reusable function instead.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [185]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates a 0-mortality history for the first
# 1/7/30 rows of each country; dropping those rows may be safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [186]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the target) and its lags -- target leakage;
# confirm this is intended. PCA is also fit unscaled and before the split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[186]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [187]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [188]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first 5 principal components (mixes of
# ALL PCA inputs, including the mortality lags), not the original variables --
# labelling them with raw feature names is misleading; PC1..PC5 would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [189]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): these dummy columns are never used downstream (X is built from
# principal_df); this step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [190]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of daily time-series rows puts adjacent days in
# both sets; a chronological split would give a more honest test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [191]:
# Fit scaling on the training set
# NOTE(review): scaling is applied to the PCA outputs; it is normally done
# BEFORE PCA so large-magnitude columns (e.g. population) do not dominate.
scaler = StandardScaler()
scaler.fit(X_train)
Out[191]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [192]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [193]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [194]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder; GridSearchCV below tunes it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 combinations; with cv=10 that is 810 forest fits
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [195]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default regressor scoring is R^2; the near-perfect CV score
# below is consistent with the mortality target feeding the PCA features.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9979148999700286
In [196]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [197]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence of
# the normalised vectors, not a standard regression metric; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013859495294009453
R2 Score: 0.9977781393083903
RMSE: 0.117726
Entropy Value: 0.0012691410818176068
In [198]:
# NOTE(review): `selected_cols` here are the labels given to the PCA component
# columns, so these importances belong to principal components, not the raw
# features the names suggest. Also `feature_importances` is first an ndarray
# and then rebound to a DataFrame — two distinct names would be clearer.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[198]:
feature importance
1 human_development_index 0.930072
2 extreme_poverty 0.041182
0 hospital_beds_per_thousand 0.024001
3 gdp_per_capita 0.004329
4 population 0.000417
In [199]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory (e.g. pathlib.Path(DATA_DIR)) so the notebook runs elsewhere.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[199]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [200]:
country1 = 'Sweden'
country2 = 'United States'

# Extracting important features for Random Forest Model Analysis for the population health index
# .copy() makes the filtered frame an independent DataFrame: later cells add
# lag columns to it, and assigning into a slice view triggers pandas'
# SettingWithCopyWarning and can silently fail to write.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [201]:
df_updated
Out[201]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
23011 Sweden 2/1/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23012 Sweden 2/2/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23013 Sweden 2/3/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23014 Sweden 2/4/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23015 Sweden 2/5/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 10 columns

In [202]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (group the series once per country, then reuse it for all three lags)
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [203]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first day/week/month of each country's series has no prior observation)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [204]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' (the target)
# and its three lagged copies, and PCA is fit on ALL rows before the
# train/test split — both are forms of target/temporal leakage. Consider
# fitting PCA on training-set predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[204]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [205]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first n_components columns (components are ordered by explained variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [206]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the raw feature names, but the
# values are principal components (linear mixes of all inputs) — labels like
# 'PC1'..'PC7' would be less misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [207]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df) — confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [208]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# principal_df rows were built from df_updated in order, so X and y align row-for-row
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series mixes past and future
# days of the same country across train/test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [209]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics never leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[209]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [210]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [211]:
# Apply scaling on the test set
# (reuses training-set mean/variance — correct; never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [212]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — GridSearchCV overrides every grid key)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [213]:
# perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to R^2 for a regressor)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9962437618415235
In [214]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ directly — repeating each key by hand falls out of sync
# if the search grid changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [215]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between two
# normalized probability distributions; applying it to raw mortality rates
# (which include zeros) is hard to interpret — confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.08758952328361695
R2 Score: 0.983188541421376
RMSE: 0.295955
Entropy Value: 0.003305972151973518
In [216]:
# NOTE(review): `selected_cols` label PCA component columns, so these
# importances belong to principal components, not the raw features the names
# suggest. `feature_importances` is also rebound ndarray -> DataFrame.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[216]:
feature importance
1 diabetes_prevalence 0.917523
0 cardiovasc_death_rate 0.043925
2 female_smokers 0.026333
6 median_age 0.005418
3 male_smokers 0.003534
5 aged_65_older 0.002001
4 life_expectancy 0.001267
In [217]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[217]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [218]:
country1 = 'Sweden'
country2 = 'United States'

# Extracting important features for the Random Forest Model Analysis for the country health index
# .copy() makes the filtered frame independent: later cells add lag columns,
# and assigning into a slice view triggers SettingWithCopyWarning and can
# silently fail to write.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [219]:
df_updated
Out[219]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
23011 Sweden 2/1/2020 2.22 0.945 0.5 46949.283 10549349 0.000000
23012 Sweden 2/2/2020 2.22 0.945 0.5 46949.283 10549349 0.000000
23013 Sweden 2/3/2020 2.22 0.945 0.5 46949.283 10549349 0.000000
23014 Sweden 2/4/2020 2.22 0.945 0.5 46949.283 10549349 0.000000
23015 Sweden 2/5/2020 2.22 0.945 0.5 46949.283 10549349 0.000000
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 338289856 1.084791

2136 rows × 8 columns

In [220]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (group once per country, reuse for all three lags)
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [221]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first day/week/month per country has no prior observation)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [222]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (target) and its lagged
# copies, and PCA is fit on all rows before the split — target/temporal leakage.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[222]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [223]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first n_components columns (ordered by explained variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [224]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): column labels reuse raw feature names but the values are
# principal components — 'PC1'..'PC5' would be less misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [225]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [226]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
# principal_df rows were built from df_updated in order, so X and y align row-for-row
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of a daily time series mixes past/future days — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [227]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics never leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[227]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [228]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [229]:
# Apply scaling on the test set
# (reuses training-set mean/variance — correct; never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [230]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — GridSearchCV overrides every grid key)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [231]:
# perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to R^2 for a regressor)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9968676791176364
In [232]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ directly — repeating each key by hand falls out of sync
# if the search grid changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [233]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence over normalized
# distributions; on raw mortality rates (with zeros) it is hard to interpret —
# confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.07653851541761683
R2 Score: 0.9853096120018122
RMSE: 0.276656
Entropy Value: 0.002994321361922591
In [234]:
# NOTE(review): `selected_cols` label PCA component columns, so these
# importances belong to principal components, not the raw features the names
# suggest. `feature_importances` is also rebound ndarray -> DataFrame.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[234]:
feature importance
1 human_development_index 0.961189
2 extreme_poverty 0.031117
3 gdp_per_capita 0.004991
4 population 0.001474
0 hospital_beds_per_thousand 0.001230
In [235]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[235]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [236]:
country1 = 'Austria'
country2 = 'Cyprus'

# Extracting important features for Random Forest Model Analysis for the population health index
# .copy() makes the filtered frame independent: later cells add lag columns,
# and assigning into a slice view triggers SettingWithCopyWarning and can
# silently fail to write.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [237]:
df_updated
Out[237]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
4148 Cyprus 12/25/2022 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.199679
4149 Cyprus 12/26/2022 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.199679
4150 Cyprus 12/27/2022 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.199679
4151 Cyprus 12/28/2022 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.199679
4152 Cyprus 12/29/2022 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.199679

2066 rows × 10 columns

In [238]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (group once per country, reuse for all three lags)
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [239]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first day/week/month per country has no prior observation)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [240]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (target) and its lagged
# copies, and PCA is fit on all rows before the split — target/temporal leakage.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[240]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [241]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first n_components columns (ordered by explained variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [242]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): column labels reuse raw feature names but the values are
# principal components — 'PC1'..'PC7' would be less misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [243]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [244]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# principal_df rows were built from df_updated in order, so X and y align row-for-row
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of a daily time series mixes past/future days — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [245]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics never leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[245]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [246]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [247]:
# Apply scaling on the test set
# (reuses training-set mean/variance — correct; never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [248]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — GridSearchCV overrides every grid key)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [249]:
# perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to R^2 for a regressor)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9956167792156323
In [250]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ directly — repeating each key by hand falls out of sync
# if the search grid changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [251]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence over normalized
# distributions; on raw mortality rates (with zeros) it is hard to interpret —
# confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0017959116723188738
R2 Score: 0.9984082585103836
RMSE: 0.042378
Entropy Value: 0.0004481618279649274
In [252]:
# NOTE(review): `selected_cols` label PCA component columns, so these
# importances belong to principal components, not the raw features the names
# suggest. `feature_importances` is also rebound ndarray -> DataFrame.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[252]:
feature importance
1 diabetes_prevalence 0.665281
0 cardiovasc_death_rate 0.297172
2 female_smokers 0.025477
6 median_age 0.006165
5 aged_65_older 0.002852
3 male_smokers 0.001868
4 life_expectancy 0.001184
In [253]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[253]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [254]:
country1 = 'Austria'
country2 = 'Cyprus'

# Extracting important features for the Random Forest Model Analysis for the country health index
# .copy() makes the filtered frame independent: later cells add lag columns,
# and assigning into a slice view triggers SettingWithCopyWarning and can
# silently fail to write.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [255]:
df_updated
Out[255]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.70 45436.686 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.70 45436.686 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.70 45436.686 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.70 45436.686 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.70 45436.686 8939617 0.000000
... ... ... ... ... ... ... ... ...
4148 Cyprus 12/25/2022 3.40 0.887 0.15 32415.132 896007 0.199679
4149 Cyprus 12/26/2022 3.40 0.887 0.15 32415.132 896007 0.199679
4150 Cyprus 12/27/2022 3.40 0.887 0.15 32415.132 896007 0.199679
4151 Cyprus 12/28/2022 3.40 0.887 0.15 32415.132 896007 0.199679
4152 Cyprus 12/29/2022 3.40 0.887 0.15 32415.132 896007 0.199679

2066 rows × 8 columns

In [256]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# One lagged column per horizon (1 day, 7 days, 30 days); shifting within each
# location group keeps one country's history from bleeding into the other's.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [257]:
# The first 1/7/30 rows of each country have no history, so shift() left NaNs there;
# treat "no prior data" as a zero mortality rate.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [258]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lag columns, so the
# prediction target leaks into the components the model is trained on — confirm intended.
# NOTE(review): PCA is fitted on unscaled data here ('population' dominates the variance);
# standardization happens only AFTER PCA below. The usual order is scale, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[258]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [259]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first 5 projected columns (components are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [260]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL input columns),
# not the original variables — reusing the raw feature names makes the later
# feature-importance table read as if it ranked raw variables. Consider 'PC1'..'PC5'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values  # .values: positional copy, index-alignment bypassed
In [261]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never fed to the model
# (X is built from principal_df below), so this step only reshapes df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [262]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values  # rows of principal_df and df_updated are position-aligned

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles a daily time series, so near-identical adjacent
# days land in both train and test — the very high R^2 reported below is partly due to this.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [263]:
# Fit scaling on the training set
# Scaler statistics come from the training rows only — the test set does not influence them.
scaler = StandardScaler()
scaler.fit(X_train)
Out[263]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [264]:
# Apply scaling on the training set
# transform() standardizes using the mean/std fitted on X_train above
X_train_scaled = scaler.transform(X_train)
In [265]:
# Apply scaling on the test set
# uses the train-set statistics — no refit on test data
X_test_scaled = scaler.transform(X_test)
In [266]:
# Instantiate the RandomForestRegressor Model.
# Only the seed is fixed here: every hyperparameter listed in param_grid below is
# overwritten by GridSearchCV per candidate, so presetting n_estimators was dead weight.
rf = RandomForestRegressor(random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [267]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 candidates x 10 folds over all CPU cores; the scores are
# unchanged because each candidate fit is deterministic (fixed random_state, unshuffled folds).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9945135037982247
In [268]:
# Refit a Random Forest on the full training set using the hyperparameter
# combination selected by the grid search above (best_params_ holds exactly
# the four searched keys, so it can be splatted directly).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out, scaled test features
y_pred = best_rf_model.predict(X_test_scaled)
In [269]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002590042399429916
R2 Score: 0.9977044094035454
RMSE: 0.050892
Entropy Value: 0.0005194256643318901
In [270]:
# Rank the (PCA-derived) inputs by how much the fitted forest relied on each one
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[270]:
feature importance
1 human_development_index 0.902511
2 extreme_poverty 0.054517
0 hospital_beds_per_thousand 0.030059
3 gdp_per_capita 0.011764
4 population 0.001149
In [271]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path so the notebook can be re-run on another machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[271]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [272]:
country1 = 'Czechia'
country2 = 'Denmark'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame, so the lag-column
# assignments in later cells do not raise SettingWithCopyWarning / write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [273]:
df_updated
Out[273]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.227772
6245 Denmark 12/26/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.227772
6246 Denmark 12/27/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.228905
6247 Denmark 12/28/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.229131
6248 Denmark 12/29/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.229131

2096 rows × 10 columns

In [274]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# One lagged column per horizon (1 day, 7 days, 30 days); shifting within each
# location group keeps one country's history from bleeding into the other's.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [275]:
# The first 1/7/30 rows of each country have no history, so shift() left NaNs there;
# treat "no prior data" as a zero mortality rate.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [276]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lag columns — the
# target leaks into the components. PCA is also fitted on unscaled data; the usual
# order is standardize first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[276]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [277]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first 7 projected columns (components are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [278]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (mixes of ALL input columns), not
# the original variables — reusing raw feature names misleads the importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values  # .values: positional copy, index-alignment bypassed
In [279]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used downstream (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [280]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values  # rows of principal_df and df_updated are position-aligned

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series places near-identical adjacent days
# in both train and test, inflating the test R^2 below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [281]:
# Fit scaling on the training set
# Scaler statistics come from the training rows only — the test set does not influence them.
scaler = StandardScaler()
scaler.fit(X_train)
Out[281]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [282]:
# Apply scaling on the training set
# transform() standardizes using the mean/std fitted on X_train above
X_train_scaled = scaler.transform(X_train)
In [283]:
# Apply scaling on the test set
# uses the train-set statistics — no refit on test data
X_test_scaled = scaler.transform(X_test)
In [284]:
# Instantiate the RandomForestRegressor Model.
# Only the seed is fixed here: every hyperparameter listed in param_grid below is
# overwritten by GridSearchCV per candidate, so presetting n_estimators was dead weight.
rf = RandomForestRegressor(random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [285]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 candidates x 10 folds over all CPU cores; the scores are
# unchanged because each candidate fit is deterministic (fixed random_state, unshuffled folds).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9987756521772833
In [286]:
# Refit a Random Forest on the full training set using the hyperparameter
# combination selected by the grid search above (best_params_ holds exactly
# the four searched keys, so it can be splatted directly).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out, scaled test features
y_pred = best_rf_model.predict(X_test_scaled)
In [287]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence of the arrays re-normalized
# as distributions — not a regression error metric, and inf where y_pred==0 but y_test>0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0030527308940722715
R2 Score: 0.9974655443925158
RMSE: 0.055252
Entropy Value: 0.0007719519126651911
In [288]:
# Rank the (PCA-derived) inputs by how much the fitted forest relied on each one
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[288]:
feature importance
1 diabetes_prevalence 0.958274
2 female_smokers 0.023643
0 cardiovasc_death_rate 0.013727
3 male_smokers 0.001553
6 median_age 0.001493
5 aged_65_older 0.001015
4 life_expectancy 0.000294
In [289]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path so the notebook can be re-run on another machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[289]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [290]:
country1 = 'Czechia'
country2 = 'Denmark'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame, so the lag-column
# assignments in later cells do not raise SettingWithCopyWarning / write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [291]:
df_updated
Out[291]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
4153 Czechia 3/1/2020 6.63 0.90 0.0 32605.906 10493990 0.000000
4154 Czechia 3/2/2020 6.63 0.90 0.0 32605.906 10493990 0.000000
4155 Czechia 3/3/2020 6.63 0.90 0.0 32605.906 10493990 0.000000
4156 Czechia 3/4/2020 6.63 0.90 0.0 32605.906 10493990 0.000000
4157 Czechia 3/5/2020 6.63 0.90 0.0 32605.906 10493990 0.000000
... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 2.50 0.94 0.2 46682.515 5882259 0.227772
6245 Denmark 12/26/2022 2.50 0.94 0.2 46682.515 5882259 0.227772
6246 Denmark 12/27/2022 2.50 0.94 0.2 46682.515 5882259 0.228905
6247 Denmark 12/28/2022 2.50 0.94 0.2 46682.515 5882259 0.229131
6248 Denmark 12/29/2022 2.50 0.94 0.2 46682.515 5882259 0.229131

2096 rows × 8 columns

In [292]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# One lagged column per horizon (1 day, 7 days, 30 days); shifting within each
# location group keeps one country's history from bleeding into the other's.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [293]:
# The first 1/7/30 rows of each country have no history, so shift() left NaNs there;
# treat "no prior data" as a zero mortality rate.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [294]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lag columns — the
# target leaks into the components. PCA is also fitted on unscaled data; the usual
# order is standardize first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[294]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [295]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first 5 projected columns (components are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [296]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (mixes of ALL input columns), not
# the original variables — reusing raw feature names misleads the importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values  # .values: positional copy, index-alignment bypassed
In [297]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used downstream (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [298]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values  # rows of principal_df and df_updated are position-aligned

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series places near-identical adjacent days
# in both train and test, inflating the test R^2 below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [299]:
# Fit scaling on the training set
# Scaler statistics come from the training rows only — the test set does not influence them.
scaler = StandardScaler()
scaler.fit(X_train)
Out[299]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [300]:
# Apply scaling on the training set
# transform() standardizes using the mean/std fitted on X_train above
X_train_scaled = scaler.transform(X_train)
In [301]:
# Apply scaling on the test set
# uses the train-set statistics — no refit on test data
X_test_scaled = scaler.transform(X_test)
In [302]:
# Instantiate the RandomForestRegressor Model.
# Only the seed is fixed here: every hyperparameter listed in param_grid below is
# overwritten by GridSearchCV per candidate, so presetting n_estimators was dead weight.
rf = RandomForestRegressor(random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [303]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 candidates x 10 folds over all CPU cores; the scores are
# unchanged because each candidate fit is deterministic (fixed random_state, unshuffled folds).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985367139047165
In [304]:
# Refit a Random Forest on the full training set using the hyperparameter
# combination selected by the grid search above (best_params_ holds exactly
# the four searched keys, so it can be splatted directly).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out, scaled test features
y_pred = best_rf_model.predict(X_test_scaled)
In [305]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence of the arrays re-normalized
# as distributions — not a regression error metric, and inf where y_pred==0 but y_test>0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002246492034029117
R2 Score: 0.9981349046049655
RMSE: 0.047397
Entropy Value: 0.0005257623846396152
In [306]:
# Rank the (PCA-derived) inputs by how much the fitted forest relied on each one
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[306]:
feature importance
1 human_development_index 0.968816
2 extreme_poverty 0.026316
0 hospital_beds_per_thousand 0.002376
3 gdp_per_capita 0.002136
4 population 0.000355
In [307]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path so the notebook can be re-run on another machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[307]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [308]:
country1 = 'France'
country2 = 'Portugal'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame, so the lag-column
# assignments in later cells do not raise SettingWithCopyWarning / write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [309]:
df_updated
Out[309]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
8376 France 1/24/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8377 France 1/25/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8378 France 1/26/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8379 France 1/27/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8380 France 1/28/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11514 Portugal 12/26/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11515 Portugal 12/27/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11516 Portugal 12/28/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11517 Portugal 12/29/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977

2105 rows × 10 columns

In [310]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# One lagged column per horizon (1 day, 7 days, 30 days); shifting within each
# location group keeps one country's history from bleeding into the other's.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [311]:
# The first 1/7/30 rows of each country have no history, so shift() left NaNs there;
# treat "no prior data" as a zero mortality rate.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [312]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lag columns — the
# target leaks into the components. PCA is also fitted on unscaled data; the usual
# order is standardize first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[312]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [313]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first 7 projected columns (components are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [314]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (mixes of ALL input columns), not
# the original variables — reusing raw feature names misleads the importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values  # .values: positional copy, index-alignment bypassed
In [315]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used downstream (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [316]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values  # rows of principal_df and df_updated are position-aligned

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series places near-identical adjacent days
# in both train and test, inflating the test R^2 below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [317]:
# Fit scaling on the training set
# Scaler statistics come from the training rows only — the test set does not influence them.
scaler = StandardScaler()
scaler.fit(X_train)
Out[317]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [318]:
# Apply scaling on the training set
# transform() standardizes using the mean/std fitted on X_train above
X_train_scaled = scaler.transform(X_train)
In [319]:
# Apply scaling on the test set
# uses the train-set statistics — no refit on test data
X_test_scaled = scaler.transform(X_test)
In [320]:
# Instantiate the RandomForestRegressor Model.
# Only the seed is fixed here: every hyperparameter listed in param_grid below is
# overwritten by GridSearchCV per candidate, so presetting n_estimators was dead weight.
rf = RandomForestRegressor(random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [321]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9935952287493937
In [322]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [323]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.12998101843122575
R2 Score: 0.9887399453385879
RMSE: 0.360529
Entropy Value: 0.002137830843805542
In [324]:
# Rank the (PCA-component) features by their importance in the tuned forest;
# last expression -> rich display.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[324]:
feature importance
1 diabetes_prevalence 0.625213
0 cardiovasc_death_rate 0.328925
6 median_age 0.020631
2 female_smokers 0.015474
5 aged_65_older 0.005432
3 male_smokers 0.003226
4 life_expectancy 0.001100
In [325]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[325]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [326]:
country1 = 'France'
country2 = 'Portugal'

# Restrict to the current country pair and to the country-health-index predictors
# (plus identifiers and the target), in one .loc selection.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [327]:
df_updated
Out[327]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 67813000 0.000000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 67813000 0.000000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 67813000 0.000000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 67813000 0.000000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 67813000 0.000000
... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 3.39 0.864 0.50 27936.896 10270857 0.462977
11514 Portugal 12/26/2022 3.39 0.864 0.50 27936.896 10270857 0.462977
11515 Portugal 12/27/2022 3.39 0.864 0.50 27936.896 10270857 0.462977
11516 Portugal 12/28/2022 3.39 0.864 0.50 27936.896 10270857 0.462977
11517 Portugal 12/29/2022 3.39 0.864 0.50 27936.896 10270857 0.462977

2105 rows × 8 columns

In [328]:
# A Random Forest needs tabular (non-sequential) inputs, so the timeseries is
# recast as a supervised-learning problem: each row gets the mortality rate
# observed 1 day, 7 days and 30 days earlier. The shifts are computed per
# country so a lag never crosses a country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [329]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [330]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[330]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [331]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [332]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [333]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [334]:
# Build the model matrix from the principal-component frame and the target
# vector, then hold out 30% of the rows for testing (seeded for reproducibility).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [335]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[335]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [336]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [337]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [338]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [339]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9902801713528262
In [340]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [341]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.11164969909464596
R2 Score: 0.9903279591904327
RMSE: 0.334140
Entropy Value: 0.001864422558963011
In [342]:
# Rank the (PCA-component) features by their importance in the tuned forest;
# last expression -> rich display.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[342]:
feature importance
1 human_development_index 0.956109
2 extreme_poverty 0.023120
0 hospital_beds_per_thousand 0.011819
3 gdp_per_capita 0.006569
4 population 0.002383
In [343]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[343]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [344]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Restrict to the current country pair and to the population-health-index predictors
# (plus identifiers and the target), in one .loc selection.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [345]:
df_updated
Out[345]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2091 rows × 10 columns

In [346]:
# A Random Forest needs tabular (non-sequential) inputs, so the timeseries is
# recast as a supervised-learning problem: each row gets the mortality rate
# observed 1 day, 7 days and 30 days earlier. The shifts are computed per
# country so a lag never crosses a country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [347]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [348]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[348]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [349]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [350]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [351]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [352]:
# Build the model matrix from the principal-component frame and the target
# vector, then hold out 30% of the rows for testing (seeded for reproducibility).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [353]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[353]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [354]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [355]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [356]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [357]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.997363058447263
In [358]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [359]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004452525636132693
R2 Score: 0.9978154993065472
RMSE: 0.066727
Entropy Value: 0.0005847514751188148
In [360]:
# Rank the (PCA-component) features by their importance in the tuned forest;
# last expression -> rich display.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[360]:
feature importance
6 median_age 0.864046
1 diabetes_prevalence 0.091568
0 cardiovasc_death_rate 0.032643
5 aged_65_older 0.008213
2 female_smokers 0.002508
3 male_smokers 0.000722
4 life_expectancy 0.000301
In [361]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[361]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [362]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Restrict to the current country pair and to the country-health-index predictors
# (plus identifiers and the target), in one .loc selection.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [363]:
df_updated
Out[363]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 5643455 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 2119843 0.536669

2091 rows × 8 columns

In [364]:
# A Random Forest needs tabular (non-sequential) inputs, so the timeseries is
# recast as a supervised-learning problem: each row gets the mortality rate
# observed 1 day, 7 days and 30 days earlier. The shifts are computed per
# country so a lag never crosses a country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [365]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [366]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[366]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [367]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [368]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [369]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [370]:
# Build the model matrix from the principal-component frame and the target
# vector, then hold out 30% of the rows for testing (seeded for reproducibility).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [371]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[371]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [372]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [373]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [374]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [375]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9972517500748429
In [376]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [377]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006989074948754362
R2 Score: 0.9965710160210536
RMSE: 0.083601
Entropy Value: 0.0008646162310375566
In [378]:
# Rank the (PCA-component) features by their importance in the tuned forest;
# last expression -> rich display.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[378]:
feature importance
1 human_development_index 0.929125
0 hospital_beds_per_thousand 0.034955
2 extreme_poverty 0.026035
3 gdp_per_capita 0.009089
4 population 0.000797
In [379]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[379]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [380]:
country1 = 'Belgium'
country2 = 'Italy'

# Restrict the frame to the population-health-index features plus the target,
# keeping only the rows that belong to the selected country pair.
population_health_cols = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
    'median_age', 'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [381]:
df_updated
Out[381]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2124 rows × 10 columns

In [382]:
'''
Convert the time-series into a supervised-learning table by adding lagged
copies of the target: the mortality rate observed 1 day, 7 days, and 30 days
earlier, built with pandas' shift() method. A Random Forest is an ensemble
method for non-sequential tabular data, so each row must carry its own
temporal context as ordinary feature columns. Shifting within each country
(groupby + shift) keeps one country's history from bleeding into another's.
'''
# Map each lag column name to its shift distance in days, then derive all
# three lag columns from the per-country mortality series.
lag_days = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, days in lag_days.items():
    df_updated[lag_col] = mortality_by_country.shift(days)
In [383]:
# Replace the NaN values that shift() produced at the start of each country's
# series (rows with no earlier observation to look back to) with 0.
# A single vectorised fillna over the three lag columns replaces the original
# three copy-pasted per-column statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [384]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the prediction target) and the three lagged
# mortality columns created above. Fitting PCA with the target present leaks
# target information into the components that are later used as model inputs —
# confirm whether this is intended; excluding the target/lag columns would be safer.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[384]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [385]:
# Keep the first 7 principal components — chosen to match the number of input
# variables for the Random Forest Model Analysis for the population health index.
# NOTE(review): the PCA above was fitted on more than 7 columns (the health
# features plus the target and lag columns), so these are the leading
# components of that wider space, not a one-to-one transform of the 7 features.
n_components = 7  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [386]:
# Wrap the retained principal components in a DataFrame and carry the country
# label alongside them.
# NOTE(review): these columns are principal components (PC1..PC7), not the
# original variables — labelling them with the raw feature names means the
# feature importances computed downstream describe components, not the named
# health features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [387]:
# Convert the categorical 'location' column to numerical indicator columns
# using one-hot encoding with the get_dummies() method.
# NOTE(review): the resulting dummy columns are never used as model inputs —
# X below is built from principal_df — so this step only changes the frame
# that y is read from.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [388]:
# Model inputs are the PCA components; the target is the mortality rate from
# the (one-hot-encoded) frame.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows as the test split for the Random Forest Model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [389]:
# Fit the StandardScaler (zero mean, unit variance per column) on the training
# split only, so no test-set statistics leak into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[389]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [390]:
# Apply the training-set scaling to the training features.
X_train_scaled = scaler.transform(X_train)
In [391]:
# Apply the same training-set scaling to the test features (no refit).
X_test_scaled = scaler.transform(X_test)
In [392]:
# Instantiate the base RandomForestRegressor; its hyperparameters are tuned by
# the grid search in the next cell (n_estimators=100 here is only the
# pre-search default).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: forest size, maximum tree depth, and the two
# split/leaf regularisation knobs (3 * 3 * 3 * 3 = 81 candidate combinations).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [393]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 x 10 fits across all CPU cores; results are
# unchanged because every candidate fit is still seeded by the estimator's
# fixed random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean CV score
# (R^2 is GridSearchCV's default scoring for regressors).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9992307049399741
In [394]:
# Refit a forest with the best hyperparameters found by the grid search.
# **grid_search.best_params_ expands the winning grid entries directly,
# replacing the four copy-pasted per-key lookups of the original.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [395]:
# Evaluate the performance of the Random Forest Model via Mean Squared Error
# (MSE), Root Mean Squared Error (RMSE), R^2 Score, and "Entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes the Kullback-Leibler
# divergence between two probability distributions (each input is normalised
# to sum to 1); y_test / y_pred are mortality-rate series, not distributions,
# so this value is not an entropy in the usual sense — confirm the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011292162553126187
R2 Score: 0.9994176200453608
RMSE: 0.106265
Entropy Value: 0.0003509709613632872
In [396]:
# Rank the model inputs by their Random Forest importance scores, largest first.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values(by='importance', ascending=False)
)
feature_importances
Out[396]:
feature importance
0 cardiovasc_death_rate 0.955787
2 female_smokers 0.033587
1 diabetes_prevalence 0.007834
5 aged_65_older 0.001161
3 male_smokers 0.000818
6 median_age 0.000596
4 life_expectancy 0.000217
In [397]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[397]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [398]:
country1 = 'Belgium'
country2 = 'Italy'

# Restrict the frame to the country-health-index features plus the target,
# keeping only the rows that belong to the selected country pair.
country_health_cols = [
    'location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [399]:
df_updated
Out[399]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 11655923 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 59037472 0.735109

2124 rows × 8 columns

In [400]:
'''
Convert the time-series into a supervised-learning table by adding lagged
copies of the target: the mortality rate observed 1 day, 7 days, and 30 days
earlier, built with pandas' shift() method. A Random Forest is an ensemble
method for non-sequential tabular data, so each row must carry its own
temporal context as ordinary feature columns. Shifting within each country
(groupby + shift) keeps one country's history from bleeding into another's.
'''
# Map each lag column name to its shift distance in days, then derive all
# three lag columns from the per-country mortality series.
lag_days = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, days in lag_days.items():
    df_updated[lag_col] = mortality_by_country.shift(days)
In [401]:
# Replace the NaN values that shift() produced at the start of each country's
# series (rows with no earlier observation to look back to) with 0.
# A single vectorised fillna over the three lag columns replaces the original
# three copy-pasted per-column statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [402]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the prediction target) and the three lagged
# mortality columns created above. Fitting PCA with the target present leaks
# target information into the components that are later used as model inputs —
# confirm whether this is intended; excluding the target/lag columns would be safer.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[402]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [403]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [404]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [405]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [406]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [407]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[407]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [408]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [409]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [410]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [411]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 x 10 fits across all CPU cores; results are
# unchanged because every candidate fit is still seeded by the estimator's
# fixed random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean CV score
# (R^2 is GridSearchCV's default scoring for regressors).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9991794764886219
In [412]:
# Refit a forest with the best hyperparameters found by the grid search.
# **grid_search.best_params_ expands the winning grid entries directly,
# replacing the four copy-pasted per-key lookups of the original.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [413]:
# Evaluate the performance of the Random Forest Model via Mean Squared Error
# (MSE), Root Mean Squared Error (RMSE), R^2 Score, and "Entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes the Kullback-Leibler
# divergence between two probability distributions (each input is normalised
# to sum to 1); y_test / y_pred are mortality-rate series, not distributions,
# so this value is not an entropy in the usual sense — confirm the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.022094083557412877
R2 Score: 0.9988605237199321
RMSE: 0.148641
Entropy Value: 0.0005922597227633427
In [414]:
# Rank the model inputs by their Random Forest importance scores, largest first.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values(by='importance', ascending=False)
)
feature_importances
Out[414]:
feature importance
1 human_development_index 0.958815
2 extreme_poverty 0.034811
0 hospital_beds_per_thousand 0.004344
3 gdp_per_capita 0.001683
4 population 0.000347
In [415]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[415]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [416]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Restrict the frame to the population-health-index features plus the target,
# keeping only the rows that belong to the selected country pair.
population_health_cols = [
    'location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
    'median_age', 'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [417]:
df_updated
Out[417]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872

2078 rows × 10 columns

In [418]:
'''
Convert the time-series into a supervised-learning table by adding lagged
copies of the target: the mortality rate observed 1 day, 7 days, and 30 days
earlier, built with pandas' shift() method. A Random Forest is an ensemble
method for non-sequential tabular data, so each row must carry its own
temporal context as ordinary feature columns. Shifting within each country
(groupby + shift) keeps one country's history from bleeding into another's.
'''
# Map each lag column name to its shift distance in days, then derive all
# three lag columns from the per-country mortality series.
lag_days = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, days in lag_days.items():
    df_updated[lag_col] = mortality_by_country.shift(days)
In [419]:
# Replace the NaN values that shift() produced at the start of each country's
# series (rows with no earlier observation to look back to) with 0.
# A single vectorised fillna over the three lag columns replaces the original
# three copy-pasted per-column statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [420]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the prediction target) and the three lagged
# mortality columns created above. Fitting PCA with the target present leaks
# target information into the components that are later used as model inputs —
# confirm whether this is intended; excluding the target/lag columns would be safer.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[420]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [421]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [422]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [423]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [424]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [425]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[425]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [426]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [427]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [428]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [429]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 x 10 fits across all CPU cores; results are
# unchanged because every candidate fit is still seeded by the estimator's
# fixed random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean CV score
# (R^2 is GridSearchCV's default scoring for regressors).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9987383944393313
In [430]:
# Refit a forest with the best hyperparameters found by the grid search.
# **grid_search.best_params_ expands the winning grid entries directly,
# replacing the four copy-pasted per-key lookups of the original.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality rates for the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
In [431]:
# Evaluate the performance of the Random Forest Model via Mean Squared Error
# (MSE), Root Mean Squared Error (RMSE), R^2 Score, and "Entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes the Kullback-Leibler
# divergence between two probability distributions (each input is normalised
# to sum to 1); y_test / y_pred are mortality-rate series, not distributions,
# so this value is not an entropy in the usual sense — confirm the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010694640005461605
R2 Score: 0.9986061376968532
RMSE: 0.103415
Entropy Value: 0.001142833048385961
In [432]:
# Rank the model inputs by their Random Forest importance scores, largest first.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values(by='importance', ascending=False)
)
feature_importances
Out[432]:
feature importance
1 diabetes_prevalence 0.778276
0 cardiovasc_death_rate 0.164676
5 aged_65_older 0.037707
2 female_smokers 0.013496
3 male_smokers 0.002915
6 median_age 0.002097
4 life_expectancy 0.000833
In [433]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[433]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [434]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Restrict the frame to the country-health-index features plus the target,
# keeping only the rows that belong to the selected country pair.
country_health_cols = [
    'location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [435]:
df_updated
Out[435]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 17564020 0.000000
... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 0.2 94277.965 647601 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 0.2 94277.965 647601 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 0.2 94277.965 647601 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 0.2 94277.965 647601 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 0.2 94277.965 647601 0.377872

2078 rows × 8 columns

In [436]:
'''
Convert the time-series into a supervised-learning table by adding lagged
copies of the target: the mortality rate observed 1 day, 7 days, and 30 days
earlier, built with pandas' shift() method. A Random Forest is an ensemble
method for non-sequential tabular data, so each row must carry its own
temporal context as ordinary feature columns. Shifting within each country
(groupby + shift) keeps one country's history from bleeding into another's.
'''
# Map each lag column name to its shift distance in days, then derive all
# three lag columns from the per-country mortality series.
lag_days = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, days in lag_days.items():
    df_updated[lag_col] = mortality_by_country.shift(days)
In [437]:
# Replace the NaN values that shift() produced at the start of each country's
# series (rows with no earlier observation to look back to) with 0.
# A single vectorised fillna over the three lag columns replaces the original
# three copy-pasted per-column statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [438]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the prediction target) and the three lagged
# mortality columns created above. Fitting PCA with the target present leaks
# target information into the components that are later used as model inputs —
# confirm whether this is intended; excluding the target/lag columns would be safer.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[438]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [439]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [440]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC5, NOT the original
# features -- labelling them with raw feature names is misleading, and the Random
# Forest "feature importances" computed later actually rank components, not features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [441]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns are never used afterwards -- only 'Mortality Rate'
# is read from df_updated below -- so this step looks redundant; confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [442]:
# X: the first 5 principal components (labelled with raw feature names); y: the target.
# Row alignment between principal_df and df_updated holds because both derive from the
# same frame in the same order.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a time series ignores temporal ordering; a
# chronological (past->future) split would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [443]:
# Fit scaling on the training set
# (Correctly fit on the training split only, so test-set statistics do not leak in.)
scaler = StandardScaler()
scaler.fit(X_train)
Out[443]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [444]:
# Apply scaling on the training set (using statistics learned from the training set only)
X_train_scaled = scaler.transform(X_train)
In [445]:
# Apply scaling on the test set (reusing the training-set statistics -- no refit)
X_test_scaled = scaler.transform(X_test)
In [446]:
# Base estimator for the hyperparameter search; random_state pinned for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [447]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 81 configs x 10 folds = 810 fits -- expensive; consider %%time / caching.
# Scoring defaults to the estimator's R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985686351034438
In [448]:
# Refit a fresh forest on the full scaled training set using the best configuration
# found by the grid search. best_params_ holds exactly the four tuned keys, so it
# can be splatted straight into the constructor (equivalent to the explicit listing,
# and to grid_search.best_estimator_ when refit=True).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [449]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence between
# the two vectors treated as normalized probability distributions -- it is not a
# standard regression metric and requires non-negative inputs; confirm this is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006943843587888249
R2 Score: 0.9990949894703176
RMSE: 0.083330
Entropy Value: 0.0007068680961652596
In [450]:
# Rank the model inputs by impurity-based importance.
# NOTE(review): the inputs are principal components that were merely labelled with
# original feature names, so these importances rank components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[450]:
feature importance
1 human_development_index 0.951584
2 extreme_poverty 0.037349
3 gdp_per_capita 0.007125
0 hospital_beds_per_thousand 0.003327
4 population 0.000615
In [451]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- not portable; prefer a DATA_DIR Path constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[451]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [452]:
# Country pair under comparison for this run of the pipeline.
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries of the current pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [453]:
df_updated
Out[453]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14645 Switzerland 12/26/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14646 Switzerland 12/27/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14647 Switzerland 12/28/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.323082
14648 Switzerland 12/29/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322149

2102 rows × 10 columns

In [454]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift inside one country's series.
# NOTE(review): these lag columns feed the PCA below together with 'Mortality Rate'
# itself (iloc[:, 2:]) -- target leakage; confirm intent.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [455]:
# The first 1 / 7 / 30 rows per country have no lagged value after shift();
# zero-fill them so every model input stays numeric.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [456]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so 'Mortality Rate' and its
# lag columns are part of the PCA input -- target leakage into the components.
# NOTE(review): PCA runs on unscaled data; high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[456]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [457]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): choosing k = "number of raw features" is arbitrary; usually k is picked
# from pca.explained_variance_ratio_.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [458]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC7, NOT the original features;
# labelling them with raw feature names makes the downstream "feature importances" misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [459]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns are never used afterwards (only 'Mortality Rate' is read);
# this step looks redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [460]:
# X: the first 7 principal components (labelled with raw feature names); y: the target.
# Row alignment holds because principal_df and df_updated derive from the same frame.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of a time series ignores temporal ordering.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [461]:
# Fit scaling on the training set
# (Fit on the training split only, so test-set statistics do not leak in.)
scaler = StandardScaler()
scaler.fit(X_train)
Out[461]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [462]:
# Apply scaling on the training set (using training-set statistics only)
X_train_scaled = scaler.transform(X_train)
In [463]:
# Apply scaling on the test set (reusing training-set statistics -- no refit)
X_test_scaled = scaler.transform(X_test)
In [464]:
# Base estimator for the hyperparameter search; random_state pinned for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [465]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 81 configs x 10 folds = 810 fits -- expensive. Scoring defaults to R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best CV score: 0.9632046791137154
In [466]:
# Refit a fresh forest on the full scaled training set using the best configuration
# found by the grid search; best_params_ holds exactly the four tuned keys, so it
# can be splatted straight into the constructor.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [467]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence of the two vectors
# treated as normalized distributions -- not a standard regression metric; confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.38500249959391253
R2 Score: 0.9847974633914387
RMSE: 0.620486
Entropy Value: 0.004637356948453654
In [468]:
# Rank the model inputs by impurity-based importance.
# NOTE(review): the inputs are principal components that were merely labelled with
# original feature names, so these importances rank components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[468]:
feature importance
0 cardiovasc_death_rate 0.817592
5 aged_65_older 0.046701
6 median_age 0.039390
1 diabetes_prevalence 0.033816
2 female_smokers 0.029796
3 male_smokers 0.020359
4 life_expectancy 0.012346
In [469]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- not portable; prefer a DATA_DIR Path constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[469]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [470]:
# Country pair under comparison for this run of the pipeline.
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Keep only the two countries of the current pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [471]:
df_updated
Out[471]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.20 39753.244 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.20 39753.244 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.20 39753.244 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.20 39753.244 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.20 39753.244 67508936 22.222222
... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 4.53 0.955 0.03 57410.166 8740471 0.322922
14645 Switzerland 12/26/2022 4.53 0.955 0.03 57410.166 8740471 0.322922
14646 Switzerland 12/27/2022 4.53 0.955 0.03 57410.166 8740471 0.322922
14647 Switzerland 12/28/2022 4.53 0.955 0.03 57410.166 8740471 0.323082
14648 Switzerland 12/29/2022 4.53 0.955 0.03 57410.166 8740471 0.322149

2102 rows × 8 columns

In [472]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift inside one country's series.
# NOTE(review): these lag columns feed the PCA below together with 'Mortality Rate'
# itself (iloc[:, 2:]) -- target leakage; confirm intent.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [473]:
# The first 1 / 7 / 30 rows per country have no lagged value after shift();
# zero-fill them so every model input stays numeric.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [474]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so 'Mortality Rate' and its
# lag columns are part of the PCA input -- target leakage into the components.
# NOTE(review): PCA runs on unscaled data; high-variance columns (e.g. population) dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[474]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [475]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): choosing k = "number of raw features" is arbitrary; usually k is picked
# from pca.explained_variance_ratio_.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [476]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC5, NOT the original features;
# labelling them with raw feature names makes the downstream "feature importances" misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [477]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot columns are never used afterwards (only 'Mortality Rate' is read).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [478]:
# X: the first 5 principal components (labelled with raw feature names); y: the target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of a time series ignores temporal ordering.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [479]:
# Fit scaling on the training set
# (Fit on the training split only, so test-set statistics do not leak in.)
scaler = StandardScaler()
scaler.fit(X_train)
Out[479]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [480]:
# Apply scaling on the training set (using training-set statistics only)
X_train_scaled = scaler.transform(X_train)
In [481]:
# Apply scaling on the test set (reusing training-set statistics -- no refit)
X_test_scaled = scaler.transform(X_test)
In [482]:
# Base estimator for the hyperparameter search; random_state pinned for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [483]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 81 configs x 10 folds = 810 fits -- expensive. Scoring defaults to R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9540507391434925
In [484]:
# Refit a fresh forest on the full scaled training set using the best configuration
# found by the grid search; best_params_ holds exactly the four tuned keys, so it
# can be splatted straight into the constructor.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [485]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence of the two vectors
# treated as normalized distributions -- not a standard regression metric; confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  1.6419846927529824
R2 Score: 0.9351631939309383
RMSE: 1.281400
Entropy Value: 0.008444377761462866
In [486]:
# Rank the model inputs by impurity-based importance.
# NOTE(review): the inputs are principal components that were merely labelled with
# original feature names, so these importances rank components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[486]:
feature importance
1 human_development_index 0.861841
2 extreme_poverty 0.061051
3 gdp_per_capita 0.049401
4 population 0.022082
0 hospital_beds_per_thousand 0.005626
In [6]:
# Country Pair by Pair Analysis relative to hospital beds per thousand
In [7]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path -- not portable; prefer a DATA_DIR Path constant.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[7]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [8]:
# Per-country slices of the cleaned dataset; the blank-line grouping shows the
# 13 pairings of countries matched on hospital beds per thousand.
df_Austria = df.loc[df["location"] == "Austria"]
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]

df_Czechia = df.loc[df["location"] == "Czechia"]
df_France = df.loc[df["location"] == "France"]

df_Romania = df.loc[df["location"] == "Romania"]
df_Slovakia = df.loc[df["location"] == "Slovakia"]

df_Belgium = df.loc[df["location"] == "Belgium"]
df_Estonia = df.loc[df["location"] == "Estonia"]

df_Latvia = df.loc[df["location"] == "Latvia"]
df_Luxembourg = df.loc[df["location"] == "Luxembourg"]

df_Serbia = df.loc[df["location"] == "Serbia"]
df_Slovenia = df.loc[df["location"] == "Slovenia"]

df_Switzerland = df.loc[df["location"] == "Switzerland"]
df_Canada = df.loc[df["location"] == "Canada"]

df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Denmark = df.loc[df["location"] == "Denmark"]

df_Finland = df.loc[df["location"] == "Finland"]
df_Iceland = df.loc[df["location"] == "Iceland"]

df_Ireland = df.loc[df["location"] == "Ireland"]
df_Italy = df.loc[df["location"] == "Italy"]

df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Portugal = df.loc[df["location"] == "Portugal"]

df_Spain = df.loc[df["location"] == "Spain"]
df_Sweden = df.loc[df["location"] == "Sweden"]

df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_UnitedStates = df.loc[df["location"] == "United States"]
In [9]:
# Drop the first two rows of the UK frame (tail(-2) keeps all rows except the first 2).
# NOTE(review): magic offset -- presumably aligns the UK start date with the other
# countries; confirm why exactly 2 rows.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [10]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
# NOTE(review): the list actually contains all 26 countries (both members of every pair),
# not only the "first" of each pair -- comment and code disagree; confirm which is intended.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# NOTE(review): written to the current working directory, but read back below from an
# absolute Downloads path -- these only match if the notebook's CWD is that folder.
dataframe_one.to_csv("dataframe-one.csv")
In [11]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path; the export above wrote to a relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[11]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [12]:
# Country pair under comparison for this run of the pipeline.
country1 = 'Austria'
country2 = 'Bulgaria'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries of the current pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [13]:
df_updated
Out[13]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
3121 Bulgaria 12/25/2022 424.688 5.81 30.1 44.4 75.05 20.801 44.7 2.949845
3122 Bulgaria 12/26/2022 424.688 5.81 30.1 44.4 75.05 20.801 44.7 2.950107
3123 Bulgaria 12/27/2022 424.688 5.81 30.1 44.4 75.05 20.801 44.7 2.949883
3124 Bulgaria 12/28/2022 424.688 5.81 30.1 44.4 75.05 20.801 44.7 2.949716
3125 Bulgaria 12/29/2022 424.688 5.81 30.1 44.4 75.05 20.801 44.7 2.949605

2066 rows × 10 columns

In [14]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift inside one country's series.
# NOTE(review): these lag columns feed the PCA below together with 'Mortality Rate'
# itself (iloc[:, 2:]) -- target leakage; confirm intent.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [15]:
# The first 1 / 7 / 30 rows per country have no lagged value after shift();
# zero-fill them so every model input stays numeric.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [16]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so 'Mortality Rate' and its
# lag columns are part of the PCA input -- target leakage into the components.
# NOTE(review): PCA runs on unscaled data; high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[16]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [17]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): choosing k = "number of raw features" is arbitrary; usually k is picked
# from pca.explained_variance_ratio_.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [18]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [19]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [20]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [21]:
# Fit the scaler on the training set only (correct: the scaler never sees test data in this step)
scaler = StandardScaler()
scaler.fit(X_train)
Out[21]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [22]:
# Standardize the training features with the train-fitted scaler
X_train_scaled = scaler.transform(X_train)
In [23]:
# Standardize the test features with the same scaler (no refit)
X_test_scaled = scaler.transform(X_test)
In [24]:
# Base RandomForestRegressor (this n_estimators=100 is a placeholder; the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3*3*3*3 = 81 combinations (x 10 CV folds below = 810 model fits)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [25]:
# Grid search with 10-fold cross-validation (default scoring = R^2 for a regressor).
# NOTE(review): n_jobs=-1 would parallelize the 810 fits at no cost to results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9548694967552779
In [26]:
# Train the final model: a fresh forest configured with the grid search's best hyperparameter
# combination (best_params_ holds exactly the four tuned keys), then predict on the test set.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Out-of-sample predictions consumed by the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [27]:
# Evaluate the model: MSE, RMSE, R^2, plus an "entropy" value.
# WARNING(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence between the two
# arrays after normalizing each to sum to 1 — it treats them as probability distributions, not as
# regression targets/predictions, and is infinite wherever y_pred is 0 but y_test is not. It is
# not a standard regression metric; consider dropping or replacing it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003026954642565577
R2 Score: 0.9983335176391287
RMSE: 0.055018
Entropy Value: 0.0007808314682988706
In [28]:
# Rank inputs by impurity-based importance.
# NOTE(review): X contained PCA component scores labeled with feature names, so these rankings
# describe principal components PC1..PC7, not the named health variables.
# (Name reuse: feature_importances is first an ndarray, then a DataFrame.)
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[28]:
feature importance
0 cardiovasc_death_rate 0.791317
6 median_age 0.093374
1 diabetes_prevalence 0.051696
2 female_smokers 0.025939
3 male_smokers 0.015921
5 aged_65_older 0.012669
4 life_expectancy 0.009084
In [29]:
# Reload the full cleaned dataset, resetting df_updated after the previous pair's mutations
# (one-hot columns, lag columns).
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory / pathlib.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[29]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [30]:
country1 = 'Austria'
country2 = 'Bulgaria'

# Keep the socioeconomic ("country health index") columns and restrict to this country pair.
# NOTE(review): the boolean-filtered frame may be a copy; later in-place column assignments on it
# can raise SettingWithCopyWarning — consider appending .copy().
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [31]:
df_updated
Out[31]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ...
3121 Bulgaria 12/25/2022 0.816 1.5 18563.307 65.180 6781955 2.949845
3122 Bulgaria 12/26/2022 0.816 1.5 18563.307 65.180 6781955 2.950107
3123 Bulgaria 12/27/2022 0.816 1.5 18563.307 65.180 6781955 2.949883
3124 Bulgaria 12/28/2022 0.816 1.5 18563.307 65.180 6781955 2.949716
3125 Bulgaria 12/29/2022 0.816 1.5 18563.307 65.180 6781955 2.949605

2066 rows × 8 columns

In [32]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, week, and month mortality rates, per country.
# NOTE(review): this lags -> fillna -> PCA -> split -> scale -> grid-search pipeline is copy-pasted
# once per country pair; extracting it into a parameterized function would remove the duplication.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [33]:
# The earliest rows of each country's series have no lag history (NaN); fill them in one
# vectorized call. NOTE(review): zero-filling asserts "zero mortality before the series
# starts" — confirm this is preferable to dropping the warm-up rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [34]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): fit on unscaled columns (iloc[:,2:]) that here include 'Mortality Rate' and its lag
# features, before the train/test split — scale dominance plus target leakage into the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[34]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [35]:
# Keep the first 5 principal components (matches the count of socioeconomic inputs, not chosen
# by explained variance).
n_components = 5  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [36]:
# WARNING(review): these columns are principal components (mixtures of ALL inputs) labeled with
# original feature names — downstream importances rank components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [37]:
# One-hot encode 'location'. NOTE(review): the dummy columns are never used afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [38]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Train/test split. NOTE(review): shuffled split of a daily time series puts near-duplicate
# adjacent days in both sets — optimistic scores; consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [39]:
# Fit the scaler on the training set only (correct: avoids test leakage in this step)
scaler = StandardScaler()
scaler.fit(X_train)
Out[39]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [40]:
# Standardize the training features
X_train_scaled = scaler.transform(X_train)
In [41]:
# Standardize the test features with the train-fitted scaler (no refit)
X_test_scaled = scaler.transform(X_test)
In [42]:
# Base RandomForestRegressor (n_estimators=100 is a placeholder; the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 81 combinations (x 10 CV folds = 810 model fits)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [43]:
# Grid search with 10-fold CV (default scoring = R^2); n_jobs=-1 would parallelize the fits
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9452216055850846
In [44]:
# Train the final model: a fresh forest configured with the grid search's best hyperparameter
# combination (best_params_ holds exactly the four tuned keys), then predict on the test set.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Out-of-sample predictions consumed by the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [45]:
# Evaluate the model: MSE, RMSE, R^2, plus an "entropy" value.
# WARNING(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence of the two arrays
# normalized to sum to 1 — not a regression metric, and infinite if y_pred is 0 where y_test is not.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.03325057701492528
R2 Score: 0.9816939774039035
RMSE: 0.182347
Entropy Value: 0.0031574369706604878
In [46]:
# Rank inputs by impurity-based importance.
# NOTE(review): X contained PCA component scores labeled with feature names, so these rankings
# describe principal components, not the named socioeconomic variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[46]:
feature importance
1 extreme_poverty 0.616534
0 human_development_index 0.327116
2 gdp_per_capita 0.035922
3 population_density 0.010367
4 population 0.010062
In [47]:
# Reload the full cleaned dataset, resetting df_updated after the previous pair's mutations.
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory / pathlib.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[47]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [48]:
country1 = 'Czechia'
country2 = 'France'

# Keep the population-health columns and restrict to this country pair.
# NOTE(review): boolean filtering may return a copy; later in-place column assignments on it can
# raise SettingWithCopyWarning — consider appending .copy().
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [49]:
df_updated
Out[49]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411892

2105 rows × 10 columns

In [50]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, week, and month mortality rates, per country.
# NOTE(review): this pipeline is copy-pasted once per country pair; a parameterized function
# would remove the duplication and the risk of divergent edits.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [51]:
# The earliest rows of each country's series have no lag history (NaN); fill them in one
# vectorized call. NOTE(review): zero-filling asserts "zero mortality before the series
# starts" — confirm this is preferable to dropping the warm-up rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [52]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): fit on unscaled columns (iloc[:,2:]) that here include 'Mortality Rate' and its lag
# features, before the train/test split — scale dominance plus target leakage into the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[52]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [53]:
# Keep the first 7 principal components (matches the count of health inputs, not variance-based)
n_components = 7  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [54]:
# WARNING(review): these columns are principal components (mixtures of ALL inputs) labeled with
# original feature names — downstream importances rank components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [55]:
# One-hot encode 'location'. NOTE(review): the dummy columns are never used afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [56]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Train/test split. NOTE(review): shuffled split of a daily time series puts near-duplicate
# adjacent days in both sets — optimistic scores; consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [57]:
# Fit the scaler on the training set only (correct: avoids test leakage in this step)
scaler = StandardScaler()
scaler.fit(X_train)
Out[57]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [58]:
# Standardize the training features
X_train_scaled = scaler.transform(X_train)
In [59]:
# Standardize the test features with the train-fitted scaler (no refit)
X_test_scaled = scaler.transform(X_test)
In [60]:
# Base RandomForestRegressor (n_estimators=100 is a placeholder; the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 81 combinations (x 10 CV folds = 810 model fits)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [61]:
# Grid search with 10-fold CV (default scoring = R^2); n_jobs=-1 would parallelize the fits
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9956602391751492
In [62]:
# Train the final model: a fresh forest configured with the grid search's best hyperparameter
# combination (best_params_ holds exactly the four tuned keys), then predict on the test set.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Out-of-sample predictions consumed by the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [63]:
# Evaluate the model: MSE, RMSE, R^2, plus an "entropy" value.
# WARNING(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence of the two arrays
# normalized to sum to 1 — not a regression metric, and infinite if y_pred is 0 where y_test is not.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.125947936501202
R2 Score: 0.9878566963057185
RMSE: 0.354891
Entropy Value: 0.002084045307312263
In [64]:
# Rank inputs by impurity-based importance.
# NOTE(review): X contained PCA component scores labeled with feature names, so these rankings
# describe principal components, not the named health variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[64]:
feature importance
5 aged_65_older 0.902459
1 diabetes_prevalence 0.079682
2 female_smokers 0.009289
3 male_smokers 0.003096
6 median_age 0.002737
0 cardiovasc_death_rate 0.002483
4 life_expectancy 0.000254
In [65]:
# Reload the full cleaned dataset, resetting df_updated after the previous pair's mutations.
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory / pathlib.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[65]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [66]:
country1 = 'Czechia'
country2 = 'France'

# Keep the socioeconomic ("country health index") columns and restrict to this country pair.
# NOTE(review): boolean filtering may return a copy; later in-place column assignments on it can
# raise SettingWithCopyWarning — consider appending .copy().
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [67]:
df_updated
Out[67]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
4153 Czechia 3/1/2020 0.900 0.00 32605.906 137.176 10493990 0.000000
4154 Czechia 3/2/2020 0.900 0.00 32605.906 137.176 10493990 0.000000
4155 Czechia 3/3/2020 0.900 0.00 32605.906 137.176 10493990 0.000000
4156 Czechia 3/4/2020 0.900 0.00 32605.906 137.176 10493990 0.000000
4157 Czechia 3/5/2020 0.900 0.00 32605.906 137.176 10493990 0.000000
... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 0.901 0.02 38605.671 122.578 67813000 0.411710
9443 France 12/26/2022 0.901 0.02 38605.671 122.578 67813000 0.411282
9444 France 12/27/2022 0.901 0.02 38605.671 122.578 67813000 0.411730
9445 France 12/28/2022 0.901 0.02 38605.671 122.578 67813000 0.411813
9446 France 12/29/2022 0.901 0.02 38605.671 122.578 67813000 0.411892

2105 rows × 8 columns

In [68]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, week, and month mortality rates, per country.
# NOTE(review): this pipeline is copy-pasted once per country pair; a parameterized function
# would remove the duplication and the risk of divergent edits.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [69]:
# The earliest rows of each country's series have no lag history (NaN); fill them in one
# vectorized call. NOTE(review): zero-filling asserts "zero mortality before the series
# starts" — confirm this is preferable to dropping the warm-up rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [70]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): fit on unscaled columns (iloc[:,2:]) that here include 'Mortality Rate' and its lag
# features, before the train/test split — scale dominance plus target leakage into the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[70]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [71]:
# Keep the first 5 principal components (matches the count of socioeconomic inputs, not variance-based)
n_components = 5  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [72]:
# WARNING(review): these columns are principal components (mixtures of ALL inputs) labeled with
# original feature names — downstream importances rank components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [73]:
# One-hot encode 'location'. NOTE(review): the dummy columns are never used afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [74]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Train/test split. NOTE(review): shuffled split of a daily time series puts near-duplicate
# adjacent days in both sets — optimistic scores; consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [75]:
# Fit the scaler on the training set only (correct: avoids test leakage in this step)
scaler = StandardScaler()
scaler.fit(X_train)
Out[75]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [76]:
# Standardize the training features
X_train_scaled = scaler.transform(X_train)
In [77]:
# Standardize the test features with the train-fitted scaler (no refit)
X_test_scaled = scaler.transform(X_test)
In [78]:
# Base RandomForestRegressor (n_estimators=100 is a placeholder; the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 81 combinations (x 10 CV folds = 810 model fits)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [79]:
# Grid search with 10-fold CV (default scoring = R^2); n_jobs=-1 would parallelize the fits
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9939471991048846
In [80]:
# Train the final model: a fresh forest configured with the grid search's best hyperparameter
# combination (best_params_ holds exactly the four tuned keys), then predict on the test set.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Out-of-sample predictions consumed by the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [81]:
# Evaluate the model: MSE, RMSE, R^2, plus an "entropy" value.
# WARNING(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence of the two arrays
# normalized to sum to 1 — not a regression metric, and infinite if y_pred is 0 where y_test is not.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.14908620050795524
R2 Score: 0.9856258144461353
RMSE: 0.386117
Entropy Value: 0.0025994670043627145
In [82]:
# Rank inputs by impurity-based importance.
# NOTE(review): X contained PCA component scores labeled with feature names, so these rankings
# describe principal components, not the named socioeconomic variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[82]:
feature importance
1 extreme_poverty 0.949247
2 gdp_per_capita 0.024344
0 human_development_index 0.017391
3 population_density 0.007825
4 population 0.001193
In [83]:
# Reload the full cleaned dataset, resetting df_updated after the previous pair's mutations.
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory / pathlib.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[83]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [84]:
country1 = 'Romania'
country2 = 'Slovakia'

# Extracting important features for Random Forest Model Analysis for the population health index
# .copy() detaches the filtered subset from the parent frame, so the lag
# columns assigned in later cells do not trigger SettingWithCopyWarning
# (assignment into a view of another DataFrame).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [85]:
df_updated
Out[85]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.07 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.07 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.07 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.07 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.07 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 17.85 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 17.85 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 17.85 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 17.85 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 17.85 43.0 2.036403

2067 rows × 10 columns

In [86]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# NOTE(review): the triple-quoted string above is a no-op expression; a
# markdown cell would express this narrative more idiomatically.
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate, so lagged
# values never bleed across countries.
# NOTE(review): df_updated came from boolean-mask filtering two cells up;
# without an intervening .copy() these assignments may emit
# SettingWithCopyWarning.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [87]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The first 1/7/30 rows per country have no prior observation; treating
# the pre-data period as zero mortality is an assumption — TODO confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [88]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' (the
# prediction target) and the three lagged-mortality columns, so the
# principal components are partly built FROM the target — target leakage
# that likely inflates the near-perfect R^2 reported below. Fit PCA on
# the predictor columns only.
# NOTE(review): PCA is also fit on the full dataset before the
# train/test split (test-set leakage) and on unscaled data, while
# StandardScaler is applied only after PCA; the usual order is
# scale -> PCA because PCA is sensitive to feature scale.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[88]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [89]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): PCA was fit on 11 columns (7 features + target + 3 lags),
# so the first 7 components are mixtures of all 11 — they do not
# correspond to the 7 named input variables.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [90]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names reuse the original feature names, but
# each column is actually a principal component (PC1..PC7); the labels —
# and the feature-importance table derived from them later — are
# misleading. Consider naming them PC1..PC7.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [91]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used —
# X below comes from principal_df — so this cell only reshapes columns
# in df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [92]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first 7 principal components (mislabeled; see note above);
# y is the raw mortality rate. Rows align positionally.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffled split on daily time-series rows lets
# near-duplicate adjacent days land in both train and test; a time-based
# split would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [93]:
# Fit scaling on the training set
# NOTE(review): scaling here standardizes the principal components, i.e.
# it runs AFTER PCA; the conventional pipeline is StandardScaler -> PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[93]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [94]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [95]:
# Apply scaling on the test set (using training-set statistics only — correct)
X_test_scaled = scaler.transform(X_test)
In [96]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — it is overridden by
# every candidate in the grid below during the search.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 combinations, each cross-validated 10-fold
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [97]:
# Perform an exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 x 10 = 810 fits across all CPU cores;
# results are unchanged because each fit is seeded via the estimator's
# random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975069670951232
In [98]:
# Refit a Random Forest on the full training set using the best
# hyperparameters found by the grid search above.
# `**grid_search.best_params_` expands to exactly the four tuned keys;
# random_state is pinned for reproducibility.
# (Equivalently, grid_search.best_estimator_ is already refit on the
# training data when refit=True, the default.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out test-set predictions, evaluated in the next cell.
y_pred = best_rf_model.predict(X_test_scaled)
In [99]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy's entropy(y_test, y_pred) is KL divergence of the
# normalized vectors, not a standard regression metric (infinite if any
# y_pred is 0 where y_test is not).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001889438938278478
R2 Score: 0.9989346063878206
RMSE: 0.043468
Entropy Value: 0.00017683320239097805
In [100]:
# NOTE(review): X was built from principal components, so these
# importances describe PC1..PC7; the original-feature labels from
# selected_cols are misleading.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[100]:
feature importance
0 cardiovasc_death_rate 0.766856
1 diabetes_prevalence 0.188484
5 aged_65_older 0.019559
2 female_smokers 0.017919
6 median_age 0.003669
3 male_smokers 0.002809
4 life_expectancy 0.000703
In [101]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[101]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [102]:
country1 = 'Romania'
country2 = 'Slovakia'

# Extracting important features for the Random Forest Model Analysis for the country health index
# .copy() detaches the filtered subset from the parent frame, so the lag
# columns assigned in later cells do not trigger SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [103]:
df_updated
Out[103]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 0.828 5.7 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 0.828 5.7 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 0.828 5.7 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 0.828 5.7 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 0.828 5.7 23313.199 85.129 19659270 2.036403

2067 rows × 8 columns

In [104]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never
# bleed across countries. NOTE(review): df_updated came from boolean-mask
# filtering without .copy(); these assignments may emit
# SettingWithCopyWarning.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [105]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# First 1/7/30 rows per country have no prior observation; zero-filling
# the pre-data period is an assumption — TODO confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [106]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and
# its three lag columns, so the components embed the target — leakage
# that likely inflates the R^2 below. PCA is also fit on the full,
# unscaled data before the train/test split (scale -> PCA -> split on
# train only would be the conventional order).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[106]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [107]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): PCA was fit on 9 columns (5 features + target + 3 lags);
# the first 5 components are mixtures of all 9, not the 5 named inputs.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [108]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): column names reuse the original feature names but each
# column is actually a principal component (PC1..PC5) — the labels and
# the importance table derived from them later are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [109]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never used downstream —
# X below comes from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [110]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first 5 principal components (mislabeled); y is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffled split on daily time-series rows — a
# time-based split would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [111]:
# Fit scaling on the training set
# NOTE(review): scaling here standardizes the principal components, i.e.
# it runs AFTER PCA; the conventional pipeline is StandardScaler -> PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[111]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [112]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [113]:
# Apply scaling on the test set (using training-set statistics only — correct)
X_test_scaled = scaler.transform(X_test)
In [114]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — it is overridden by
# every candidate in the grid below during the search.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 combinations, each cross-validated 10-fold
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [115]:
# Perform an exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 x 10 = 810 fits across all CPU cores;
# results are unchanged because each fit is seeded via the estimator's
# random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9977554150263899
In [116]:
# Refit a Random Forest on the full training set using the best
# hyperparameters found by the grid search above.
# `**grid_search.best_params_` expands to exactly the four tuned keys;
# random_state is pinned for reproducibility.
# (Equivalently, grid_search.best_estimator_ is already refit on the
# training data when refit=True, the default.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out test-set predictions, evaluated in the next cell.
y_pred = best_rf_model.predict(X_test_scaled)
In [117]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy's entropy(y_test, y_pred) is KL divergence of the
# normalized vectors, not a standard regression metric (infinite if any
# y_pred is 0 where y_test is not).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0016264328679556177
R2 Score: 0.9990829070190872
RMSE: 0.040329
Entropy Value: 0.0001602501218235589
In [118]:
# NOTE(review): X was built from principal components, so these
# importances describe PC1..PC5; the original-feature labels from
# selected_cols are misleading.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[118]:
feature importance
1 extreme_poverty 0.623249
0 human_development_index 0.351020
2 gdp_per_capita 0.019462
3 population_density 0.005409
4 population 0.000861
In [119]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[119]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [120]:
country1 = 'Belgium'
country2 = 'Estonia'

# Extracting important features for Random Forest Model Analysis for the population health index
# .copy() detaches the filtered subset from the parent frame, so the lag
# columns assigned in later cells do not trigger SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [121]:
df_updated
Out[121]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.464100
7306 Estonia 12/26/2022 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.464100
7307 Estonia 12/27/2022 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.463645
7308 Estonia 12/28/2022 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.466423
7309 Estonia 12/29/2022 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.466423

2121 rows × 10 columns

In [122]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never
# bleed across countries. NOTE(review): df_updated came from boolean-mask
# filtering without .copy(); these assignments may emit
# SettingWithCopyWarning.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [123]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# First 1/7/30 rows per country have no prior observation; zero-filling
# the pre-data period is an assumption — TODO confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [124]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and
# its three lag columns, so the components embed the target — leakage
# that likely inflates the R^2 below. PCA is also fit on the full,
# unscaled data before the train/test split (scale -> PCA -> split on
# train only would be the conventional order).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[124]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [125]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): PCA was fit on 11 columns (7 features + target + 3 lags);
# the first 7 components are mixtures of all 11, not the 7 named inputs.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [126]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): column names reuse the original feature names but each
# column is actually a principal component (PC1..PC7) — the labels and
# the importance table derived from them later are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [127]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never used downstream —
# X below comes from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [128]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first 7 principal components (mislabeled); y is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffled split on daily time-series rows — a
# time-based split would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [129]:
# Fit scaling on the training set
# NOTE(review): scaling here standardizes the principal components, i.e.
# it runs AFTER PCA; the conventional pipeline is StandardScaler -> PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[129]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [130]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [131]:
# Apply scaling on the test set (using training-set statistics only — correct)
X_test_scaled = scaler.transform(X_test)
In [132]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — it is overridden by
# every candidate in the grid below during the search.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 combinations, each cross-validated 10-fold
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [133]:
# Perform an exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the 81 x 10 = 810 fits across all CPU cores;
# results are unchanged because each fit is seeded via the estimator's
# random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984633650339623
In [134]:
# Refit a Random Forest on the full training set using the best
# hyperparameters found by the grid search above.
# `**grid_search.best_params_` expands to exactly the four tuned keys;
# random_state is pinned for reproducibility.
# (Equivalently, grid_search.best_estimator_ is already refit on the
# training data when refit=True, the default.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out test-set predictions, evaluated in the next cell.
y_pred = best_rf_model.predict(X_test_scaled)
In [135]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy's entropy(y_test, y_pred) is KL divergence of the
# normalized vectors, not a standard regression metric (infinite if any
# y_pred is 0 where y_test is not).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01970524664080637
R2 Score: 0.9984388662695985
RMSE: 0.140375
Entropy Value: 0.0008164149340349968
In [136]:
# NOTE(review): X was built from principal components, so these
# importances describe PC1..PC7; the original-feature labels from
# selected_cols are misleading.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[136]:
feature importance
1 diabetes_prevalence 0.725259
0 cardiovasc_death_rate 0.226135
2 female_smokers 0.037629
5 aged_65_older 0.005417
6 median_age 0.002877
3 male_smokers 0.002343
4 life_expectancy 0.000341
In [137]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[137]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [138]:
country1 = 'Belgium'
country2 = 'Estonia'

# Extracting important features for the Random Forest Model Analysis for the country health index
# .copy() detaches the filtered subset from the parent frame, so the lag
# columns assigned in later cells do not trigger SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [139]:
df_updated
Out[139]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 0.892 0.5 29481.252 31.033 1326064 0.464100
7306 Estonia 12/26/2022 0.892 0.5 29481.252 31.033 1326064 0.464100
7307 Estonia 12/27/2022 0.892 0.5 29481.252 31.033 1326064 0.463645
7308 Estonia 12/28/2022 0.892 0.5 29481.252 31.033 1326064 0.466423
7309 Estonia 12/29/2022 0.892 0.5 29481.252 31.033 1326064 0.466423

2121 rows × 8 columns

In [140]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Work on an explicit copy: df_updated was produced by boolean-filtering a
# larger frame, so assigning new columns to it directly risks a
# SettingWithCopyWarning (and, on some pandas versions, a silent no-op write).
df_updated = df_updated.copy()

# Create lagged mortality features (1 day / 7 days / 30 days), computed per
# country so a lag never bleeds across a country boundary.
# NOTE(review): the 7/30-row shifts equal "previous week/month" only if rows
# are daily and contiguous — the data shows date gaps (e.g. 2/12 -> 2/24 for
# Luxembourg), so these are really 7-/30-observation lags. TODO confirm intent.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_rows in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_rows)
In [141]:
# The lag features are undefined for the first 1/7/30 observations of each
# country's series; impute 0 there ("no prior mortality observed") so the
# frame has no missing values before modelling.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [142]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] drops only 'location' and 'date', so the
# PCA input still contains 'Mortality Rate' (the prediction target) and its
# three lag columns — components derived from the target leak it into the
# model inputs and inflate every score reported downstream. The target and lag
# columns should be excluded before fitting. Additionally, PCA is fitted on
# unscaled data here (standardisation only happens after this step) and on the
# full dataset rather than the training split only, a second source of leakage.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[142]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [143]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): keeping the first 5 components is not equivalent to keeping the
# 5 original input variables — each component is a linear combination of every
# numeric column that pca was fitted on (which here includes the mortality and
# lag columns).
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [144]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC5, not the
# original variables — labelling them with the raw feature names is misleading,
# and the "feature importances" reported later inherit these wrong labels.
# Prefer neutral names such as 'PC1'..'PC5'.
# The assignment below is positional: it relies on principal_components rows
# being in the same order as df_updated rows (true here, since transform was
# applied to df_updated directly).
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [145]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns created here are never used — the
# model's X comes from principal_df and y reads only 'Mortality Rate'. This
# step can likely be removed; verify nothing downstream depends on it.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [146]:
# NOTE(review): these names refer to principal-component columns of
# principal_df (which were assigned the raw feature names after PCA), not to
# the original variables themselves.
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default; consecutive daily rows
# are near-duplicates of each other, so shuffling places almost-identical
# observations in both train and test and inflates the scores. A chronological
# (per-country) split would give an honest estimate — TODO confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [147]:
# Fit scaling on the training set
# Correct at this step: the scaler learns mean/std from X_train only and is
# reused (transform only, no refit) on the test split in the cells below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[147]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [148]:
# Apply scaling on the training set
# transform() uses the mean/std learned from X_train in the previous cell.
X_train_scaled = scaler.transform(X_train)
In [149]:
# Apply scaling on the test set
# Reuses the training-set statistics (no refit) — the correct procedure.
X_test_scaled = scaler.transform(X_test)
In [150]:
# Base estimator for the hyper-parameter search; random_state is pinned so the
# search (and the refit below it) is reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Search space explored by GridSearchCV: forest size, tree depth, and the two
# leaf/split regularisers.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [151]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the near-1.0 CV score below should not be read as genuine
# predictive skill — the principal-component inputs were derived from a matrix
# that included 'Mortality Rate' itself (see the PCA fit cell), so the target
# leaks into the features.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983140649374975
In [152]:
# fit random forest model with best hyperparameters from above
# Re-specifying each hyperparameter by hand is brittle (a key added to
# param_grid later would silently be ignored here); unpack best_params_
# instead. Equivalent shortcut: grid_search.best_estimator_, which
# GridSearchCV (refit=True, the default) has already refit on the full
# training split with these same parameters.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test set, evaluated in the next cell.
y_pred = best_rf_model.predict(X_test_scaled)
In [153]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays to sum to 1
# and returns the KL divergence between the resulting distributions — it
# treats y_test/y_pred as probability vectors, not regression outputs, so this
# number is not an error metric and has no clear interpretation here. Consider
# dropping it (or replacing it with MAE).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01836382495651645
R2 Score: 0.9985451394199026
RMSE: 0.135513
Entropy Value: 0.001203144259433017
In [154]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[154]:
feature importance
1 extreme_poverty 0.945679
2 gdp_per_capita 0.038606
0 human_development_index 0.011137
3 population_density 0.003912
4 population 0.000666
In [155]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — not reproducible on other
# machines. Prefer a configurable DATA_DIR (pathlib.Path) defined once near
# the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[155]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [156]:
# Country pair analysed in this pipeline run.
country1 = 'Latvia'
country2 = 'Luxembourg'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# NOTE(review): later cells assign new columns to this boolean-filtered slice,
# which triggers SettingWithCopyWarning — append .copy() here to make the
# frame's ownership explicit.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [157]:
df_updated
Out[157]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2079 rows × 10 columns

In [158]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): this lag-construction + fillna pipeline is repeated verbatim
# for every country pairing in the notebook — extract it into one helper
# function (parameterised by the dataframe) so the copies cannot diverge.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [159]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# Zero is imputed where no prior observation exists (start of each country's series).
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [160]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location'/'date', so 'Mortality Rate'
# (the target) and its three lag columns remain in the PCA input — target
# leakage that inflates every score reported downstream. PCA is also fitted on
# unscaled data and on the full dataset rather than the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[160]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [161]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# Keeping the first 7 components is not the same as keeping the 7 raw inputs —
# each component mixes all numeric columns pca was fitted on.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [162]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are principal components PC1..PC7, not the original
# variables — naming them after the raw features makes the later
# "feature importance" table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [163]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [164]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [165]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[165]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [166]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [167]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [168]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [169]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983315321322627
In [170]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [171]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0028103373338490153
R2 Score: 0.9929235243700243
RMSE: 0.053013
Entropy Value: 0.0010319225336165856
In [172]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): these labels name principal components, not the raw variables
# (principal_df's columns were assigned the raw feature names after PCA), so
# the headline "diabetes_prevalence is the top predictor" conclusion is not
# supported by this table.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[172]:
feature importance
1 diabetes_prevalence 0.915515
6 median_age 0.050625
2 female_smokers 0.016696
0 cardiovasc_death_rate 0.009803
3 male_smokers 0.003679
5 aged_65_older 0.003224
4 life_expectancy 0.000458
In [173]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[173]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [174]:
country1 = 'Latvia'
country2 = 'Luxembourg'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [175]:
df_updated
Out[175]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 0.916 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 0.916 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 0.916 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 0.916 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 0.916 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 0.866 0.7 25063.846 31.212 1850654 0.631969

2079 rows × 8 columns

In [176]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): verbatim duplicate of the lag-construction cells used for the
# other country pairings — extract into a shared helper function.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [177]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# Zero is imputed where no prior observation exists (start of each country's series).
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [178]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location'/'date', so 'Mortality Rate'
# (the target) and its three lag columns remain in the PCA input — target
# leakage that inflates every score reported downstream. PCA is also fitted on
# unscaled data and on the full dataset rather than the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[178]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [179]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# Keeping the first 5 components is not the same as keeping the 5 raw inputs —
# each component mixes all numeric columns pca was fitted on.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [180]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are principal components PC1..PC5, not the original
# variables — naming them after the raw features makes the later
# "feature importance" table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [181]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [182]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [183]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[183]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [184]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [185]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [186]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [187]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9969485269472338
In [188]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [189]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003435249287771037
R2 Score: 0.991349985791738
RMSE: 0.058611
Entropy Value: 0.0013719375765382364
In [190]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): these labels name principal components, not the raw variables
# (principal_df's columns were assigned the raw feature names after PCA), so
# the headline "extreme_poverty is the top predictor" conclusion is not
# supported by this table.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[190]:
feature importance
1 extreme_poverty 0.937685
2 gdp_per_capita 0.038266
0 human_development_index 0.019146
3 population_density 0.003910
4 population 0.000993
In [191]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[191]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [192]:
country1 = 'Serbia'
country2 = 'Slovenia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [193]:
df_updated
Out[193]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2100 rows × 10 columns

In [194]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): verbatim duplicate of the lag-construction cells used for the
# other country pairings — extract into a shared helper function.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [195]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# Zero is imputed where no prior observation exists (start of each country's series).
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [196]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location'/'date', so 'Mortality Rate'
# (the target) and its three lag columns remain in the PCA input — target
# leakage that inflates every score reported downstream. PCA is also fitted on
# unscaled data and on the full dataset rather than the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[196]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [197]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# Keeping the first 7 components is not the same as keeping the 7 raw inputs —
# each component mixes all numeric columns pca was fitted on.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [198]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are principal components PC1..PC7, not the original
# variables — naming them after the raw features makes the later
# "feature importance" table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [199]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [200]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [201]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[201]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [202]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [203]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [204]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [205]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9949427400565657
In [206]:
# Refit a random forest on the full training set with the winning
# hyperparameters. Unpacking best_params_ avoids re-typing each parameter
# (the original listed them one by one, which silently drops any parameter
# later added to the grid). Equivalent to grid_search.best_estimator_
# since GridSearchCV refits by default.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, identically scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [207]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns the KL divergence D(y_test || y_pred).
# Applied to raw mortality-rate vectors this is not a standard regression
# metric, and it is infinite whenever y_pred is 0 where y_test is not —
# interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010607100132758094
R2 Score: 0.9938721980926469
RMSE: 0.102991
Entropy Value: 0.0015120167964044018
In [208]:
# Rank the model inputs by impurity-based importance, largest first.
# NOTE(review): if X was built from PCA components relabeled with raw
# feature names (as elsewhere in this notebook), each importance belongs to
# a principal component, not the named original feature — confirm.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[208]:
feature importance
1 diabetes_prevalence 0.794401
0 cardiovasc_death_rate 0.138390
5 aged_65_older 0.041077
2 female_smokers 0.022642
6 median_age 0.001811
3 male_smokers 0.001273
4 life_expectancy 0.000407
In [209]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[209]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [210]:
country1 = 'Serbia'
country2 = 'Slovenia'

# Extracting important features for the Random Forest Model Analysis for the country health index
# Keep the socioeconomic feature columns plus the target, restricted to the
# two countries being compared. The trailing .copy() materializes an
# independent frame so that the lagged-column assignments in later cells do
# not trigger pandas' SettingWithCopyWarning on a filtered view.
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [211]:
# Display the filtered two-country frame for inspection.
df_updated
Out[211]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 0.806 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 0.806 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 0.806 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 0.806 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 0.806 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 0.917 0.00 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 0.917 0.00 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 0.917 0.00 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 0.917 0.00 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 0.917 0.00 31400.840 102.619 2119843 0.536669

2100 rows × 8 columns

In [212]:
# A Random Forest is a non-sequential learner, so the time series is recast
# as a supervised-learning problem: per-country lagged copies of the target
# (previous day / week / month mortality, via pandas shift()) become
# candidate features for predicting the current mortality rate.
for col, lag in (('prev_day_mortality', 1),
                 ('prev_week_mortality', 7),
                 ('prev_month_mortality', 30)):
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [213]:
# The first 1/7/30 rows of each country have no lagged value; treat that
# pre-data period as zero mortality instead of dropping the rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [214]:
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged copies,
# so the target leaks into the components later used as predictors — this
# inflates downstream R^2. PCA is also fit on unscaled data, so
# large-magnitude columns (e.g. population) dominate the components.
# TODO: fit PCA on standardized predictor columns only.
pca = PCA().fit(df_updated.iloc[:,2:])
pca
Out[214]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [215]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first 5 components; equivalent to fitting PCA(n_components=5).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [216]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing raw feature names (e.g. 'extreme_poverty') is
# misleading; 'PC1'..'PC5' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# transform preserves row order, so positional alignment with df_updated holds.
principal_df['location'] = df_updated['location'].values
In [217]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X is built
# from principal_df — so this step only rewrites df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [218]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: first five principal components (relabeled with raw feature names);
# y: the untransformed mortality-rate target. The two frames are aligned
# positionally (same row order).
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of daily time-series rows mixes past and future.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [219]:
# Learn standardization statistics (mean/std) from the training split only,
# so the test set cannot influence the scaling — then display the fitted scaler.
scaler = StandardScaler().fit(X_train)
scaler
Out[219]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [220]:
# Apply scaling on the training set
# (standardizes X_train with the mean/std fitted in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [221]:
# Apply scaling on the test set
# Reuses the training-set statistics — no test-set leakage at this step.
X_test_scaled = scaler.transform(X_test)
In [222]:
# Base random forest regressor; fixed seed keeps results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by the grid search below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [223]:
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the
# scaled training data; default scoring for a regressor is R^2.
grid_search = GridSearchCV(rf, param_grid, cv=10).fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9971862848708511
In [224]:
# Refit a random forest on the full training set with the winning
# hyperparameters. Unpacking best_params_ avoids re-typing each parameter
# (the original listed them one by one, which silently drops any parameter
# later added to the grid). Equivalent to grid_search.best_estimator_
# since GridSearchCV refits by default.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, identically scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [225]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns the KL divergence D(y_test || y_pred).
# Applied to raw mortality-rate vectors this is not a standard regression
# metric, and it is infinite whenever y_pred is 0 where y_test is not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004953974082252353
R2 Score: 0.9971380517341916
RMSE: 0.070384
Entropy Value: 0.001138633276467043
In [226]:
# Rank the model inputs by impurity-based importance, largest first.
# NOTE(review): X was built from PCA components relabeled with raw feature
# names, so each importance belongs to a principal component, not the
# named original feature.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[226]:
feature importance
1 extreme_poverty 0.943705
2 gdp_per_capita 0.028816
0 human_development_index 0.022792
3 population_density 0.004115
4 population 0.000572
In [227]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[227]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [228]:
country1 = 'Switzerland'
country2 = 'Canada'

# Extracting important features for Random Forest Model Analysis for the population health index
# Keep the population-health feature columns plus the target, restricted to
# the two countries being compared. The trailing .copy() materializes an
# independent frame so that the lagged-column assignments in later cells do
# not trigger pandas' SettingWithCopyWarning on a filtered view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [229]:
# Display the filtered two-country frame for inspection.
df_updated
Out[229]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2111 rows × 10 columns

In [230]:
# A Random Forest is a non-sequential learner, so the time series is recast
# as a supervised-learning problem: per-country lagged copies of the target
# (previous day / week / month mortality, via pandas shift()) become
# candidate features for predicting the current mortality rate.
for col, lag in (('prev_day_mortality', 1),
                 ('prev_week_mortality', 7),
                 ('prev_month_mortality', 30)):
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [231]:
# The first 1/7/30 rows of each country have no lagged value; treat that
# pre-data period as zero mortality instead of dropping the rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [232]:
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged copies,
# so the target leaks into the components later used as predictors — this
# inflates downstream R^2. PCA is also fit on unscaled data, so
# large-magnitude columns dominate the components.
# TODO: fit PCA on standardized predictor columns only.
pca = PCA().fit(df_updated.iloc[:,2:])
pca
Out[232]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [233]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first 7 components; equivalent to fitting PCA(n_components=7).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [234]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing raw feature names is misleading; 'PC1'..'PC7' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# transform preserves row order, so positional alignment with df_updated holds.
principal_df['location'] = df_updated['location'].values
In [235]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X is built
# from principal_df — so this step only rewrites df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [236]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: first seven principal components (relabeled with raw feature names);
# y: the untransformed mortality-rate target. The two frames are aligned
# positionally (same row order).
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of daily time-series rows mixes past and future.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [237]:
# Learn standardization statistics (mean/std) from the training split only,
# so the test set cannot influence the scaling — then display the fitted scaler.
scaler = StandardScaler().fit(X_train)
scaler
Out[237]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [238]:
# Apply scaling on the training set
# (standardizes X_train with the mean/std fitted in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [239]:
# Apply scaling on the test set
# Reuses the training-set statistics — no test-set leakage at this step.
X_test_scaled = scaler.transform(X_test)
In [240]:
# Base random forest regressor; fixed seed keeps results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by the grid search below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [241]:
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the
# scaled training data; default scoring for a regressor is R^2.
grid_search = GridSearchCV(rf, param_grid, cv=10).fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9988028205689966
In [242]:
# Refit a random forest on the full training set with the winning
# hyperparameters. Unpacking best_params_ avoids re-typing each parameter
# (the original listed them one by one, which silently drops any parameter
# later added to the grid). Equivalent to grid_search.best_estimator_
# since GridSearchCV refits by default.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, identically scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [243]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns the KL divergence D(y_test || y_pred).
# Applied to raw mortality-rate vectors this is not a standard regression
# metric, and it is infinite whenever y_pred is 0 where y_test is not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0061427712660212285
R2 Score: 0.9981286918115353
RMSE: 0.078376
Entropy Value: 0.0009086325364058084
In [244]:
# Rank the model inputs by impurity-based importance, largest first.
# NOTE(review): X was built from PCA components relabeled with raw feature
# names, so each importance belongs to a principal component, not the
# named original feature.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[244]:
feature importance
1 diabetes_prevalence 0.853362
0 cardiovasc_death_rate 0.097828
2 female_smokers 0.022562
5 aged_65_older 0.021620
6 median_age 0.002997
3 male_smokers 0.001511
4 life_expectancy 0.000120
In [245]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[245]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [246]:
country1 = 'Switzerland'
country2 = 'Canada'

# Extracting important features for the Random Forest Model Analysis for the country health index
# Keep the socioeconomic feature columns plus the target, restricted to the
# two countries being compared. The trailing .copy() materializes an
# independent frame so that the lagged-column assignments in later cells do
# not trigger pandas' SettingWithCopyWarning on a filtered view.
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [247]:
# Display the filtered two-country frame for inspection.
df_updated
Out[247]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 0.955 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 0.955 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 0.955 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 0.955 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 0.955 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 0.929 0.50 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 0.929 0.50 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 0.929 0.50 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 0.929 0.50 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 0.929 0.50 44017.591 4.037 38454328 1.093162

2111 rows × 8 columns

In [248]:
# A Random Forest is a non-sequential learner, so the time series is recast
# as a supervised-learning problem: per-country lagged copies of the target
# (previous day / week / month mortality, via pandas shift()) become
# candidate features for predicting the current mortality rate.
for col, lag in (('prev_day_mortality', 1),
                 ('prev_week_mortality', 7),
                 ('prev_month_mortality', 30)):
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [249]:
# The first 1/7/30 rows of each country have no lagged value; treat that
# pre-data period as zero mortality instead of dropping the rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [250]:
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged copies,
# so the target leaks into the components later used as predictors — this
# inflates downstream R^2. PCA is also fit on unscaled data, so
# large-magnitude columns (e.g. population) dominate the components.
# TODO: fit PCA on standardized predictor columns only.
pca = PCA().fit(df_updated.iloc[:,2:])
pca
Out[250]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [251]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first 5 components; equivalent to fitting PCA(n_components=5).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [252]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing raw feature names is misleading; 'PC1'..'PC5' would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# transform preserves row order, so positional alignment with df_updated holds.
principal_df['location'] = df_updated['location'].values
In [253]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream — X is built
# from principal_df — so this step only rewrites df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [254]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: first five principal components (relabeled with raw feature names);
# y: the untransformed mortality-rate target. The two frames are aligned
# positionally (same row order).
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of daily time-series rows mixes past and future.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [255]:
# Learn standardization statistics (mean/std) from the training split only,
# so the test set cannot influence the scaling — then display the fitted scaler.
scaler = StandardScaler().fit(X_train)
scaler
Out[255]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [256]:
# Apply scaling on the training set
# (standardizes X_train with the mean/std fitted in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [257]:
# Apply scaling on the test set
# Reuses the training-set statistics — no test-set leakage at this step.
X_test_scaled = scaler.transform(X_test)
In [258]:
# Base random forest regressor; fixed seed keeps results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by the grid search below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [259]:
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the
# scaled training data; default scoring for a regressor is R^2.
grid_search = GridSearchCV(rf, param_grid, cv=10).fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986226086040209
In [260]:
# Refit a random forest on the full training set with the winning
# hyperparameters. Unpacking best_params_ avoids re-typing each parameter
# (the original listed them one by one, which silently drops any parameter
# later added to the grid). Equivalent to grid_search.best_estimator_
# since GridSearchCV refits by default.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, identically scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [261]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns the KL divergence D(y_test || y_pred).
# Applied to raw mortality-rate vectors this is not a standard regression
# metric, and it is infinite whenever y_pred is 0 where y_test is not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009274554366257183
R2 Score: 0.9971746384850856
RMSE: 0.096304
Entropy Value: 0.0011833353007308447
In [262]:
# Rank the model inputs by impurity-based importance, largest first.
# NOTE(review): X was built from PCA components relabeled with raw feature
# names, so each importance belongs to a principal component, not the
# named original feature.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[262]:
feature importance
1 extreme_poverty 0.960005
2 gdp_per_capita 0.024331
0 human_development_index 0.012512
3 population_density 0.002821
4 population 0.000331
In [263]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[263]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [264]:
country1 = 'Cyprus'
country2 = 'Denmark'

# Extracting important features for Random Forest Model Analysis for the population health index
# Keep the population-health feature columns plus the target, restricted to
# the two countries being compared. The trailing .copy() materializes an
# independent frame so that the lagged-column assignments in later cells do
# not trigger pandas' SettingWithCopyWarning on a filtered view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [265]:
# Display the filtered two-country frame for inspection.
df_updated
Out[265]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.227772
6245 Denmark 12/26/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.227772
6246 Denmark 12/27/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.228905
6247 Denmark 12/28/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.229131
6248 Denmark 12/29/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.229131

2089 rows × 10 columns

In [266]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three mortality lags (1 day, 7 days, 30 days) per country in one pass over the group.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [267]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (rows earlier than each lag window have no history; zero matches the pre-outbreak mortality rate)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [268]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] at this point includes 'Mortality Rate' and the three
# lagged-mortality columns, so the prediction target leaks into the PCA inputs — confirm this
# is intentional; otherwise drop those columns before fitting.
# NOTE(review): PCA is fit on raw, unscaled features here (StandardScaler is only applied after
# the train/test split below); unscaled PCA is dominated by the largest-magnitude columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[268]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [269]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first 7 components (PCA orders them by explained variance); iloc[:, 2:] has
# more than 7 columns here (it also contains the target and lag columns), so this truncates.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [270]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of ALL of the PCA
# inputs), not the original variables — labelling them with the original feature names is
# misleading and makes the later feature-importance table read as if it ranked raw variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Positional alignment: pca.transform preserves the row order of df_updated.
principal_df['location'] = df_updated['location'].values
In [271]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream in this pipeline
# (X is built from principal_df) — this step appears to be dead code; confirm before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [272]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the 7 principal components (see naming caveat where principal_df is built);
# y is taken positionally from df_updated, whose row order principal_df matches.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random (shuffled) split on daily time-series data lets near-identical
# neighbouring days land in both train and test, which inflates the reported scores —
# consider a time-based split for an honest generalization estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [273]:
# Fit scaling on the training set
# Fit on the training split only, so test-set statistics never influence the transform.
scaler = StandardScaler()
scaler.fit(X_train)
Out[273]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [274]:
# Apply scaling on the training set
# transform() uses the mean/std learned in the fit cell above.
X_train_scaled = scaler.transform(X_train)
In [275]:
# Apply scaling on the test set
# The same training-set statistics are applied to the test split (no refit on test data).
X_test_scaled = scaler.transform(X_test)
In [276]:
# Instantiate the RandomForestRegressor Model
# The n_estimators=100 set here is only a placeholder — GridSearchCV below overrides it
# (and the other tuned parameters) with values from param_grid.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 candidate combinations, each evaluated with 10-fold CV below.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [277]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 81 candidates x 10 folds across all cores; results are unchanged
# because each candidate still trains with random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9977923468629054
In [278]:
# GridSearchCV refits the best hyperparameter combination on the whole training set by
# default (refit=True), so reuse that fitted estimator instead of rebuilding it parameter by
# parameter — same model (rf carried random_state=42), less duplication.
best_rf_model = grid_search.best_estimator_

# Predict on the held-out test split with the tuned model.
y_pred = best_rf_model.predict(X_test_scaled)
In [279]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after normalising both
# arrays into probability distributions; applying it to raw regression targets/predictions is
# not a standard regression metric, and any zero in y_pred yields inf — interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001712667679856554
R2 Score: 0.9986011517667411
RMSE: 0.041384
Entropy Value: 0.00040951059147751267
In [280]:
# Rank the model inputs by impurity-based importance, built in one step instead of reusing
# the name `feature_importances` for both the raw ndarray and the DataFrame.
# NOTE(review): X was built from PCA components that were merely *labelled* with the original
# column names, so these importances rank principal components, not the raw variables.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
Out[280]:
feature importance
6 median_age 0.849240
1 diabetes_prevalence 0.129051
5 aged_65_older 0.013820
0 cardiovasc_death_rate 0.003271
2 female_smokers 0.002484
3 male_smokers 0.001336
4 life_expectancy 0.000799
In [281]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR / relative path
# so the notebook survives Restart & Run All on another machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[281]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [282]:
country1 = 'Cyprus'
country2 = 'Denmark'

# Extracting important features for the Random Forest Model Analysis for the country health index.
# .copy() makes the filtered frame independent of its parent, so the lagged-mortality columns
# added in the next cells do not trigger SettingWithCopyWarning / silently fail on a view.
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [283]:
df_updated
Out[283]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 0.940 0.20 46682.515 136.520 5882259 0.227772
6245 Denmark 12/26/2022 0.940 0.20 46682.515 136.520 5882259 0.227772
6246 Denmark 12/27/2022 0.940 0.20 46682.515 136.520 5882259 0.228905
6247 Denmark 12/28/2022 0.940 0.20 46682.515 136.520 5882259 0.229131
6248 Denmark 12/29/2022 0.940 0.20 46682.515 136.520 5882259 0.229131

2089 rows × 8 columns

In [284]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [285]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [286]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] at this point includes 'Mortality Rate' and the three
# lagged-mortality columns, so the prediction target leaks into the PCA inputs — confirm this
# is intentional; otherwise drop those columns before fitting.
# NOTE(review): PCA is fit on raw, unscaled features; the untransformed 'population' column
# will dominate the components because of its magnitude.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[286]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [287]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [288]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [289]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [290]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [291]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[291]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [292]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [293]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [294]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [295]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9963693732409921
In [296]:
# GridSearchCV refits the best hyperparameter combination on the whole training set by
# default (refit=True), so reuse that fitted estimator instead of rebuilding it parameter by
# parameter — same model (rf carried random_state=42), less duplication.
best_rf_model = grid_search.best_estimator_

# Predict on the held-out test split with the tuned model.
y_pred = best_rf_model.predict(X_test_scaled)
In [297]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after normalising both
# arrays into probability distributions; applying it to raw regression targets/predictions is
# not a standard regression metric, and any zero in y_pred yields inf — interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0017673867368454051
R2 Score: 0.9985564591173178
RMSE: 0.042040
Entropy Value: 0.0008057123359739224
In [298]:
# Rank the model inputs by impurity-based importance, built in one step instead of reusing
# the name `feature_importances` for both the raw ndarray and the DataFrame.
# NOTE(review): X was built from PCA components that were merely *labelled* with the original
# column names, so these importances rank principal components, not the raw variables.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
Out[298]:
feature importance
1 extreme_poverty 0.930685
2 gdp_per_capita 0.045621
0 human_development_index 0.018531
3 population_density 0.004051
4 population 0.001112
In [299]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR / relative path
# so the notebook survives Restart & Run All on another machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[299]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [300]:
country1 = 'Finland'
country2 = 'Iceland'

# Extracting important features for Random Forest Model Analysis for the population health index.
# .copy() makes the filtered frame independent of its parent, so the lagged-mortality columns
# added in the next cells do not trigger SettingWithCopyWarning / silently fail on a view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [301]:
df_updated
Out[301]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7311 Finland 1/30/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7312 Finland 1/31/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7313 Finland 2/1/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7314 Finland 2/2/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
... ... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2102 rows × 10 columns

In [302]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [303]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [304]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] at this point includes 'Mortality Rate' and the three
# lagged-mortality columns, so the prediction target leaks into the PCA inputs — confirm this
# is intentional; otherwise drop those columns before fitting.
# NOTE(review): PCA is fit on raw, unscaled features (scaling only happens after the split),
# so large-magnitude columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[304]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [305]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [306]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [307]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [308]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [309]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[309]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [310]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [311]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [312]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [313]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9955159123177418
In [314]:
# GridSearchCV refits the best hyperparameter combination on the whole training set by
# default (refit=True), so reuse that fitted estimator instead of rebuilding it parameter by
# parameter — same model (rf carried random_state=42), less duplication.
best_rf_model = grid_search.best_estimator_

# Predict on the held-out test split with the tuned model.
y_pred = best_rf_model.predict(X_test_scaled)
In [315]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after normalising both
# arrays into probability distributions; applying it to raw regression targets/predictions is
# not a standard regression metric, and any zero in y_pred yields inf — interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003560841308279542
R2 Score: 0.9970021670932279
RMSE: 0.059673
Entropy Value: 0.0011802071599462333
In [316]:
# Rank the model inputs by impurity-based importance, built in one step instead of reusing
# the name `feature_importances` for both the raw ndarray and the DataFrame.
# NOTE(review): X was built from PCA components that were merely *labelled* with the original
# column names, so these importances rank principal components, not the raw variables.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
Out[316]:
feature importance
1 diabetes_prevalence 0.517316
0 cardiovasc_death_rate 0.432531
6 median_age 0.019175
5 aged_65_older 0.014394
2 female_smokers 0.011943
3 male_smokers 0.003277
4 life_expectancy 0.001363
In [317]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR / relative path
# so the notebook survives Restart & Run All on another machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[317]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [318]:
country1 = 'Finland'
country2 = 'Iceland'

# Extracting important features for the Random Forest Model Analysis for the country health index.
# .copy() makes the filtered frame independent of its parent, so the lagged-mortality columns
# added in the next cells do not trigger SettingWithCopyWarning / silently fail on a view.
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [319]:
df_updated
Out[319]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
7310 Finland 1/29/2020 0.938 0.04 40585.721 18.136 5540745 0.00000
7311 Finland 1/30/2020 0.938 0.04 40585.721 18.136 5540745 0.00000
7312 Finland 1/31/2020 0.938 0.04 40585.721 18.136 5540745 0.00000
7313 Finland 2/1/2020 0.938 0.04 40585.721 18.136 5540745 0.00000
7314 Finland 2/2/2020 0.938 0.04 40585.721 18.136 5540745 0.00000
... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 0.949 0.20 46482.958 3.404 372903 0.11011

2102 rows × 8 columns

In [320]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [321]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [322]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] at this point includes 'Mortality Rate' and the three
# lagged-mortality columns, so the prediction target leaks into the PCA inputs — confirm this
# is intentional; otherwise drop those columns before fitting.
# NOTE(review): PCA is fit on raw, unscaled features; the untransformed 'population' column
# will dominate the components because of its magnitude.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[322]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [323]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [324]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [325]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [326]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [327]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[327]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [328]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [329]:
# Apply scaling on the test set
# (same train-fit statistics applied to unseen data — no test-set leakage here)
X_test_scaled = scaler.transform(X_test)
In [330]:
# Base random-forest regressor; the grid below supersedes n_estimators.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Hyperparameter search space for the grid search.
param_grid = {
    'n_estimators': [50, 100, 200],      # forest size
    'max_depth': [5, 10, 15],            # depth cap per tree
    'min_samples_split': [2, 5, 10],     # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],       # min samples required at a leaf
}
In [331]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): both the train/test split and the CV folds are random shuffles
# of a daily time series, so temporally adjacent (near-duplicate) rows land on
# both sides of every split — the near-perfect scores below are optimistic.
# A time-aware splitter (e.g. TimeSeriesSplit) would give an honest estimate.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9954917175836794
In [332]:
# Rebuild the forest with the winning hyperparameters. best_params_ contains
# exactly the grid keys (n_estimators, max_depth, min_samples_split,
# min_samples_leaf), so unpacking it reproduces the explicit keyword call.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test rows.
y_pred = best_rf_model.predict(X_test_scaled)
In [333]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# WARNING(review): scipy.stats.entropy(pk, qk) normalises its inputs into
# probability distributions and returns a KL divergence — it is not a
# regression error metric, and y_test contains exact zeros. The printed value
# should not be interpreted as a model-quality "entropy".
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002995537974182363
R2 Score: 0.9974780897167169
RMSE: 0.054732
Entropy Value: 0.0010015390611765433
In [334]:
# Tabulate impurity-based importances, highest first.
# NOTE(review): the "features" here are PCA component scores relabelled with
# original variable names, so the ranking is over components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[334]:
feature importance
1 extreme_poverty 0.940726
0 human_development_index 0.036143
2 gdp_per_capita 0.013851
3 population_density 0.006981
4 population 0.002299
In [335]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable data directory
# (pathlib.Path) would make the notebook portable. This reload also discards
# the one-hot-encoded frame from the previous section.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[335]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [336]:
country1 = 'Ireland'
country2 = 'Italy'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the two-country subset from the full frame, so the lag
# columns assigned in the next cell are written to an independent DataFrame
# (avoids SettingWithCopyWarning / ambiguous view-vs-copy semantics).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [337]:
# Preview the filtered two-country subset (rich DataFrame display).
df_updated
Out[337]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18839 Ireland 3/1/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18840 Ireland 3/2/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18841 Ireland 3/3/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18842 Ireland 3/4/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2099 rows × 10 columns

In [338]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): groupby('location') keeps each country's shift() from bleeding
# into the other country's rows. These assignments mutate the filtered frame;
# if the filtering cell returned a view (no .copy()), pandas may emit
# SettingWithCopyWarning here.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [339]:
# The first rows of each country's series have no earlier observation for the
# lagged features; encode "no history" as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [340]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# WARNING(review): df_updated.iloc[:, 2:] here spans every column after 'date'
# — including 'Mortality Rate' (the target) and its three lag columns — so the
# components are partly built from the quantity being predicted. This target
# leakage largely explains the near-perfect R^2 downstream. The columns are
# also unstandardised, letting high-variance variables dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[340]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [341]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): slicing keeps the first 7 component scores; the target leakage
# flagged at the fit cell carries over, since the same target-containing
# columns are transformed here.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [342]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# WARNING(review): these columns are principal-component scores (PC1..PC7), NOT
# the original variables — relabelling them with original feature names makes
# the later "feature importance" table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [343]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used downstream in this
# section (X comes from principal_df, y from 'Mortality Rate').
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [344]:
# Assemble the design matrix (seven principal-component scores, mislabelled
# with original variable names) and the per-row mortality-rate target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [345]:
# Learn standardisation statistics (mean/std) from the training split only,
# so the test set never influences the scaling parameters.
scaler = StandardScaler().fit(X_train)
scaler
Out[345]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [346]:
# Apply scaling on the training set
# (standardise with the statistics fit on X_train above)
X_train_scaled = scaler.transform(X_train)
In [347]:
# Apply scaling on the test set
# (same train-fit statistics applied to unseen data — no test-set leakage here)
X_test_scaled = scaler.transform(X_test)
In [348]:
# Base random-forest regressor; the grid below supersedes n_estimators.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Hyperparameter search space for the grid search.
param_grid = {
    'n_estimators': [50, 100, 200],      # forest size
    'max_depth': [5, 10, 15],            # depth cap per tree
    'min_samples_split': [2, 5, 10],     # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],       # min samples required at a leaf
}
In [349]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): random shuffling of a daily time series puts temporally
# adjacent, near-duplicate rows in both train and validation folds; combined
# with the target leakage in the PCA inputs, the score below is inflated.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989134316621475
In [350]:
# Rebuild the forest with the winning hyperparameters. best_params_ contains
# exactly the grid keys (n_estimators, max_depth, min_samples_split,
# min_samples_leaf), so unpacking it reproduces the explicit keyword call.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test rows.
y_pred = best_rf_model.predict(X_test_scaled)
In [351]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# WARNING(review): scipy.stats.entropy(pk, qk) computes a KL divergence over
# normalised distributions — not a regression metric; y_test contains zeros.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007507485729557742
R2 Score: 0.9993832430454774
RMSE: 0.086646
Entropy Value: 0.0003063610193860753
In [352]:
# Tabulate impurity-based importances, highest first.
# NOTE(review): the "features" here are PCA component scores relabelled with
# original variable names, so the ranking is over components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[352]:
feature importance
5 aged_65_older 0.839694
1 diabetes_prevalence 0.112669
6 median_age 0.028748
2 female_smokers 0.013838
0 cardiovasc_death_rate 0.003462
3 male_smokers 0.001347
4 life_expectancy 0.000242
In [353]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable data directory
# (pathlib.Path) would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[353]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [354]:
country1 = 'Ireland'
country2 = 'Italy'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() detaches the two-country subset from the full frame, so the lag
# columns assigned in the next cell are written to an independent DataFrame
# (avoids SettingWithCopyWarning / ambiguous view-vs-copy semantics).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [355]:
# Preview the filtered two-country subset (rich DataFrame display).
df_updated
Out[355]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 0.955 0.2 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 0.955 0.2 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 0.955 0.2 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 0.955 0.2 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 0.955 0.2 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 0.892 2.0 35220.084 205.859 59037472 0.735109

2099 rows × 8 columns

In [356]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): groupby('location') keeps each country's shift() from bleeding
# into the other country's rows; assignments mutate the filtered frame, so
# SettingWithCopyWarning is possible if the filter returned a view (no .copy()).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [357]:
# The first rows of each country's series have no earlier observation for the
# lagged features; encode "no history" as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [358]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# WARNING(review): iloc[:, 2:] here includes 'Mortality Rate' (the target) and
# its three lag columns, so the components are partly built from the quantity
# being predicted — target leakage that inflates the downstream scores. The
# columns are also unstandardised before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[358]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [359]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): keeps the first 5 component scores; the target leakage flagged
# at the fit cell carries over, since the same columns are transformed here.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [360]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# WARNING(review): these columns are principal-component scores (PC1..PC5), NOT
# the original variables — the names below misrepresent the later importances.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [361]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used downstream in this
# section (X comes from principal_df, y from 'Mortality Rate').
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [362]:
# Assemble the design matrix (five principal-component scores, mislabelled
# with original variable names) and the per-row mortality-rate target.
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [363]:
# Learn standardisation statistics (mean/std) from the training split only,
# so the test set never influences the scaling parameters.
scaler = StandardScaler().fit(X_train)
scaler
Out[363]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [364]:
# Apply scaling on the training set
# (standardise with the statistics fit on X_train above)
X_train_scaled = scaler.transform(X_train)
In [365]:
# Apply scaling on the test set
# (same train-fit statistics applied to unseen data — no test-set leakage here)
X_test_scaled = scaler.transform(X_test)
In [366]:
# Base random-forest regressor; the grid below supersedes n_estimators.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Hyperparameter search space for the grid search.
param_grid = {
    'n_estimators': [50, 100, 200],      # forest size
    'max_depth': [5, 10, 15],            # depth cap per tree
    'min_samples_split': [2, 5, 10],     # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],       # min samples required at a leaf
}
In [367]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): random shuffling of a daily time series puts temporally
# adjacent, near-duplicate rows in both train and validation folds; combined
# with the target leakage in the PCA inputs, the score below is inflated.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985178533250438
In [368]:
# Rebuild the forest with the winning hyperparameters. best_params_ contains
# exactly the grid keys (n_estimators, max_depth, min_samples_split,
# min_samples_leaf), so unpacking it reproduces the explicit keyword call.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test rows.
y_pred = best_rf_model.predict(X_test_scaled)
In [369]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# WARNING(review): scipy.stats.entropy(pk, qk) computes a KL divergence over
# normalised distributions — not a regression metric; y_test contains zeros.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00967545011446578
R2 Score: 0.9992051398615733
RMSE: 0.098364
Entropy Value: 0.0004210493876806918
In [370]:
# Tabulate impurity-based importances, highest first.
# NOTE(review): the "features" here are PCA component scores relabelled with
# original variable names, so the ranking is over components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[370]:
feature importance
1 extreme_poverty 0.943896
2 gdp_per_capita 0.028862
0 human_development_index 0.023004
3 population_density 0.003151
4 population 0.001088
In [371]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable data directory
# (pathlib.Path) would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[371]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [372]:
country1 = 'Netherlands'
country2 = 'Portugal'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the two-country subset from the full frame, so the lag
# columns assigned in the next cell are written to an independent DataFrame
# (avoids SettingWithCopyWarning / ambiguous view-vs-copy semantics).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [373]:
# Preview the filtered two-country subset (rich DataFrame display).
df_updated
Out[373]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11514 Portugal 12/26/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11515 Portugal 12/27/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11516 Portugal 12/28/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11517 Portugal 12/29/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977

2071 rows × 10 columns

In [374]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): groupby('location') keeps each country's shift() from bleeding
# into the other country's rows; assignments mutate the filtered frame, so
# SettingWithCopyWarning is possible if the filter returned a view (no .copy()).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [375]:
# The first rows of each country's series have no earlier observation for the
# lagged features; encode "no history" as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [376]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# WARNING(review): iloc[:, 2:] here includes 'Mortality Rate' (the target) and
# its three lag columns, so the components are partly built from the quantity
# being predicted — target leakage that inflates the downstream scores. The
# columns are also unstandardised before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[376]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [377]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): keeps the first 7 component scores; the target leakage flagged
# at the fit cell carries over, since the same columns are transformed here.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [378]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# WARNING(review): these columns are principal-component scores (PC1..PC7), NOT
# the original variables — the names below misrepresent the later importances.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [379]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used downstream in this
# section (X comes from principal_df, y from 'Mortality Rate').
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [380]:
# Assemble the design matrix (seven principal-component scores, mislabelled
# with original variable names) and the per-row mortality-rate target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [381]:
# Learn standardisation statistics (mean/std) from the training split only,
# so the test set never influences the scaling parameters.
scaler = StandardScaler().fit(X_train)
scaler
Out[381]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [382]:
# Apply scaling on the training set
# (standardise with the statistics fit on X_train above)
X_train_scaled = scaler.transform(X_train)
In [383]:
# Apply scaling on the test set
# (same train-fit statistics applied to unseen data — no test-set leakage here)
X_test_scaled = scaler.transform(X_test)
In [384]:
# Base random-forest regressor; the grid below supersedes n_estimators.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Hyperparameter search space for the grid search.
param_grid = {
    'n_estimators': [50, 100, 200],      # forest size
    'max_depth': [5, 10, 15],            # depth cap per tree
    'min_samples_split': [2, 5, 10],     # min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],       # min samples required at a leaf
}
In [385]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): random shuffling of a daily time series puts temporally
# adjacent, near-duplicate rows in both train and validation folds; combined
# with the target leakage in the PCA inputs, the score below is inflated.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9987759689263965
In [386]:
# Rebuild the forest with the winning hyperparameters. best_params_ contains
# exactly the grid keys (n_estimators, max_depth, min_samples_split,
# min_samples_leaf), so unpacking it reproduces the explicit keyword call.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test rows.
y_pred = best_rf_model.predict(X_test_scaled)
In [387]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# WARNING(review): scipy.stats.entropy(pk, qk) computes a KL divergence over
# normalised distributions — not a regression metric; y_test contains zeros.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006167962396446283
R2 Score: 0.999198720666993
RMSE: 0.078536
Entropy Value: 0.00030928623928154725
In [388]:
# Tabulate impurity-based importances, highest first.
# NOTE(review): the "features" here are PCA component scores relabelled with
# original variable names, so the ranking is over components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[388]:
feature importance
0 cardiovasc_death_rate 0.490064
1 diabetes_prevalence 0.465516
2 female_smokers 0.035163
3 male_smokers 0.003769
6 median_age 0.003047
5 aged_65_older 0.001992
4 life_expectancy 0.000448
In [389]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — a configurable data directory
# (pathlib.Path) would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[389]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [390]:
country1 = 'Netherlands'
country2 = 'Portugal'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered result an independent frame, so the lag-column
# assignments in later cells cannot raise SettingWithCopyWarning or write to a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [391]:
# Preview the filtered two-country frame to sanity-check the selection
df_updated
Out[391]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 0.864 0.5 27936.896 112.371 10270857 0.462977
11514 Portugal 12/26/2022 0.864 0.5 27936.896 112.371 10270857 0.462977
11515 Portugal 12/27/2022 0.864 0.5 27936.896 112.371 10270857 0.462977
11516 Portugal 12/28/2022 0.864 0.5 27936.896 112.371 10270857 0.462977
11517 Portugal 12/29/2022 0.864 0.5 27936.896 112.371 10270857 0.462977

2071 rows × 8 columns

In [392]:
# Convert the time series into a supervised-learning layout: add lagged
# mortality features (previous day / week / month) per country, so a Random
# Forest — which expects independent tabular rows — can be applied directly.
# NOTE(review): shift() assumes rows are date-sorted within each location;
# confirm the CSV ordering before trusting these lags.
# .assign() returns a new frame instead of mutating the filtered slice in
# place, avoiding pandas SettingWithCopyWarning on a derived frame.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_country.shift(1),
    prev_week_mortality=mortality_by_country.shift(7),
    prev_month_mortality=mortality_by_country.shift(30),
)
In [393]:
# The earliest rows have no history to lag from; treat those missing lags as 0.
# All three lag columns are filled in a single vectorised operation.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [394]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lag columns in the PCA input — target leakage into the features.
# NOTE(review): PCA is fit on unscaled data (high-variance columns such as
# population dominate the components) and on the full dataset before the
# train/test split, which leaks test-set information. Confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[394]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [395]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Components are ordered by explained variance, so slicing [:, :n_components]
# keeps the five highest-variance directions of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [396]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of every
# PCA input), not the original features — reusing the original feature names here
# makes the later feature-importance table read as if raw features were ranked.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [397]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never used downstream — the model
# matrix X is built from principal_df — so this cell only affects the frame that
# supplies the y vector.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [398]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first five principal components (labelled with original feature names);
# y is aligned with X only by row position, so both frames must keep their order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles randomly; for time-series data this mixes
# future and past observations across the split — consider TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [399]:
# Fit the standardiser on the training split only, so the test data never
# influences the scaling statistics (mean/std are learned from X_train alone).
scaler = StandardScaler().fit(X_train)
scaler
Out[399]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [400]:
# Apply scaling on the training set
# (uses the mean/std statistics fitted on X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [401]:
# Apply scaling on the test set
# (reuses the training-set statistics — correct: the scaler must not be re-fit on test data)
X_test_scaled = scaler.transform(X_test)
In [402]:
# Instantiate the RandomForestRegressor Model.
# (n_estimators here is a placeholder; GridSearchCV below clones this
# estimator and overrides the tuned hyperparameters from the grid.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate configurations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [403]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 x 10 cross-validation fits on all CPU cores; this is a
# pure speed-up and does not change the selected model or its scores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score (R^2 by default for regressors)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985701391894292
In [404]:
# Refit a random forest with the best hyperparameters found by the grid search.
# **best_params_ unpacks the winning grid entry directly — identical to listing
# each hyperparameter by hand, but cannot drift out of sync with param_grid.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [405]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments and
# computes the KL divergence between them as probability distributions; applied
# to raw regression targets/predictions it is not a standard error metric —
# confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006396441720377513
R2 Score: 0.9991690389425403
RMSE: 0.079978
Entropy Value: 0.00045399991497102854
In [406]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the model was trained on principal components that merely carry
# the original feature names, so read these rows as components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[406]:
feature importance
1 extreme_poverty 0.954092
2 gdp_per_capita 0.038297
0 human_development_index 0.003655
3 population_density 0.003477
4 population 0.000478
In [407]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine;
# prefer a single DATA_DIR constant defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[407]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [408]:
country1 = 'Spain'
country2 = 'Sweden'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered result an independent frame, so the lag-column
# assignments in later cells cannot raise SettingWithCopyWarning or write to a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [409]:
# Preview the filtered two-country frame to sanity-check the selection
df_updated
Out[409]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
23011 Sweden 2/1/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23012 Sweden 2/2/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23013 Sweden 2/3/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23014 Sweden 2/4/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23015 Sweden 2/5/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148

2126 rows × 10 columns

In [410]:
# Convert the time series into a supervised-learning layout: add lagged
# mortality features (previous day / week / month) per country, so a Random
# Forest — which expects independent tabular rows — can be applied directly.
# NOTE(review): shift() assumes rows are date-sorted within each location;
# confirm the CSV ordering before trusting these lags.
# .assign() returns a new frame instead of mutating the filtered slice in
# place, avoiding pandas SettingWithCopyWarning on a derived frame.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_country.shift(1),
    prev_week_mortality=mortality_by_country.shift(7),
    prev_month_mortality=mortality_by_country.shift(30),
)
In [411]:
# The earliest rows have no history to lag from; treat those missing lags as 0.
# All three lag columns are filled in a single vectorised operation.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [412]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lag columns in the PCA input — target leakage into the features.
# NOTE(review): PCA is fit on unscaled data (high-variance columns dominate the
# components) and on the full dataset before the train/test split, which leaks
# test-set information. Confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[412]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [413]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Components are ordered by explained variance, so slicing [:, :n_components]
# keeps the seven highest-variance directions of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [414]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of every
# PCA input), not the original features — reusing the original feature names here
# makes the later feature-importance table read as if raw features were ranked.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [415]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never used downstream — the model
# matrix X is built from principal_df — so this cell only affects the frame that
# supplies the y vector.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [416]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first seven principal components (labelled with original feature names);
# y is aligned with X only by row position, so both frames must keep their order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles randomly; for time-series data this mixes
# future and past observations across the split — consider TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [417]:
# Fit the standardiser on the training split only, so the test data never
# influences the scaling statistics (mean/std are learned from X_train alone).
scaler = StandardScaler().fit(X_train)
scaler
Out[417]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [418]:
# Apply scaling on the training set
# (uses the mean/std statistics fitted on X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [419]:
# Apply scaling on the test set
# (reuses the training-set statistics — correct: the scaler must not be re-fit on test data)
X_test_scaled = scaler.transform(X_test)
In [420]:
# Instantiate the RandomForestRegressor Model.
# (n_estimators here is a placeholder; GridSearchCV below clones this
# estimator and overrides the tuned hyperparameters from the grid.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate configurations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [421]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 x 10 cross-validation fits on all CPU cores; this is a
# pure speed-up and does not change the selected model or its scores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score (R^2 by default for regressors)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984616500795094
In [422]:
# Refit a random forest with the best hyperparameters found by the grid search.
# **best_params_ unpacks the winning grid entry directly — identical to listing
# each hyperparameter by hand, but cannot drift out of sync with param_grid.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [423]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments and
# computes the KL divergence between them as probability distributions; applied
# to raw regression targets/predictions it is not a standard error metric —
# confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.015222412063753327
R2 Score: 0.9982144571626685
RMSE: 0.123379
Entropy Value: 0.0005650826297719659
In [424]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the model was trained on principal components that merely carry
# the original feature names, so read these rows as components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[424]:
feature importance
1 diabetes_prevalence 0.972202
2 female_smokers 0.021973
5 aged_65_older 0.002099
3 male_smokers 0.001720
0 cardiovasc_death_rate 0.001019
6 median_age 0.000697
4 life_expectancy 0.000290
In [425]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine;
# prefer a single DATA_DIR constant defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[425]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [426]:
country1 = 'Spain'
country2 = 'Sweden'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered result an independent frame, so the lag-column
# assignments in later cells cannot raise SettingWithCopyWarning or write to a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [427]:
# Preview the filtered two-country frame to sanity-check the selection
df_updated
Out[427]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
23011 Sweden 2/1/2020 0.945 0.5 46949.283 24.718 10549349 0.000000
23012 Sweden 2/2/2020 0.945 0.5 46949.283 24.718 10549349 0.000000
23013 Sweden 2/3/2020 0.945 0.5 46949.283 24.718 10549349 0.000000
23014 Sweden 2/4/2020 0.945 0.5 46949.283 24.718 10549349 0.000000
23015 Sweden 2/5/2020 0.945 0.5 46949.283 24.718 10549349 0.000000
... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 0.904 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 0.904 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 0.904 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 0.904 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 0.904 1.0 34272.360 93.105 47558632 0.855148

2126 rows × 8 columns

In [428]:
# Convert the time series into a supervised-learning layout: add lagged
# mortality features (previous day / week / month) per country, so a Random
# Forest — which expects independent tabular rows — can be applied directly.
# NOTE(review): shift() assumes rows are date-sorted within each location;
# confirm the CSV ordering before trusting these lags.
# .assign() returns a new frame instead of mutating the filtered slice in
# place, avoiding pandas SettingWithCopyWarning on a derived frame.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_country.shift(1),
    prev_week_mortality=mortality_by_country.shift(7),
    prev_month_mortality=mortality_by_country.shift(30),
)
In [429]:
# The earliest rows have no history to lag from; treat those missing lags as 0.
# All three lag columns are filled in a single vectorised operation.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [430]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lag columns in the PCA input — target leakage into the features.
# NOTE(review): PCA is fit on unscaled data (high-variance columns such as
# population dominate the components) and on the full dataset before the
# train/test split, which leaks test-set information. Confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[430]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [431]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Components are ordered by explained variance, so slicing [:, :n_components]
# keeps the five highest-variance directions of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [432]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of every
# PCA input), not the original features — reusing the original feature names here
# makes the later feature-importance table read as if raw features were ranked.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [433]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never used downstream — the model
# matrix X is built from principal_df — so this cell only affects the frame that
# supplies the y vector.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [434]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first five principal components (labelled with original feature names);
# y is aligned with X only by row position, so both frames must keep their order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles randomly; for time-series data this mixes
# future and past observations across the split — consider TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [435]:
# Fit the standardiser on the training split only, so the test data never
# influences the scaling statistics (mean/std are learned from X_train alone).
scaler = StandardScaler().fit(X_train)
scaler
Out[435]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [436]:
# Apply scaling on the training set
# (uses the mean/std statistics fitted on X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [437]:
# Apply scaling on the test set
# (reuses the training-set statistics — correct: the scaler must not be re-fit on test data)
X_test_scaled = scaler.transform(X_test)
In [438]:
# Instantiate the RandomForestRegressor Model.
# (n_estimators here is a placeholder; GridSearchCV below clones this
# estimator and overrides the tuned hyperparameters from the grid.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate configurations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [439]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 x 10 cross-validation fits on all CPU cores; this is a
# pure speed-up and does not change the selected model or its scores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score (R^2 by default for regressors)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984002473559823
In [440]:
# Refit a random forest with the best hyperparameters found by the grid search.
# **best_params_ unpacks the winning grid entry directly — identical to listing
# each hyperparameter by hand, but cannot drift out of sync with param_grid.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [441]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments and
# computes the KL divergence between them as probability distributions; applied
# to raw regression targets/predictions it is not a standard error metric —
# confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.021481691633920292
R2 Score: 0.9974802626239476
RMSE: 0.146566
Entropy Value: 0.0008034528795503455
In [442]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the model was trained on principal components that merely carry
# the original feature names, so read these rows as components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[442]:
feature importance
1 extreme_poverty 0.973986
2 gdp_per_capita 0.022250
3 population_density 0.002222
0 human_development_index 0.001199
4 population 0.000344
In [443]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine;
# prefer a single DATA_DIR constant defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[443]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [444]:
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered result an independent frame, so the lag-column
# assignments in later cells cannot raise SettingWithCopyWarning or write to a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [445]:
# Preview the filtered two-country frame to sanity-check the selection
df_updated
Out[445]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 10 columns

In [446]:
# Convert the time series into a supervised-learning layout: add lagged
# mortality features (previous day / week / month) per country, so a Random
# Forest — which expects independent tabular rows — can be applied directly.
# NOTE(review): shift() assumes rows are date-sorted within each location;
# confirm the CSV ordering before trusting these lags.
# .assign() returns a new frame instead of mutating the filtered slice in
# place, avoiding pandas SettingWithCopyWarning on a derived frame.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_country.shift(1),
    prev_week_mortality=mortality_by_country.shift(7),
    prev_month_mortality=mortality_by_country.shift(30),
)
In [447]:
# The earliest rows have no history to lag from; treat those missing lags as 0.
# All three lag columns are filled in a single vectorised operation.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [448]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its lag columns in the PCA input — target leakage into the features.
# NOTE(review): PCA is fit on unscaled data (high-variance columns dominate the
# components) and on the full dataset before the train/test split, which leaks
# test-set information. Confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[448]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [449]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Components are ordered by explained variance, so slicing [:, :n_components]
# keeps the seven highest-variance directions of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [450]:
# Wrap the principal components in a DataFrame for the downstream model.
# NOTE(review): these columns are principal components (linear mixtures of all PCA
# inputs), not the original features — labelling them with original feature names makes
# the later feature-importance table easy to misread; PC1..PC7 would be truthful names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [451]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by the cells
# that follow (X is built from principal_df, y from 'Mortality Rate'), so this step
# appears to be dead weight here — confirm before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [452]:
# Principal-component features for the model; principal_df rows are positionally
# aligned with df_updated (transform and get_dummies both preserve row order),
# so X and y line up.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [453]:
# Fit the standardizer on the training split only, so test-set statistics never
# influence the scaling (avoids train/test leakage at this stage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[453]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [454]:
# Standardize the training features with the training-set mean/std
X_train_scaled = scaler.transform(X_train)
In [455]:
# Standardize the test features with the training-set statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [456]:
# Base estimator; its n_estimators is a placeholder that the grid below overrides
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space (3*3*3*3 = 81 candidate combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [457]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# Scoring defaults to the estimator's score method, i.e. R^2 for a regressor.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameters and their mean cross-validated R^2
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9639563497859113
In [458]:
# Refit a forest using the best hyperparameters found above. Unpacking
# best_params_ keeps this cell in sync with the grid automatically instead of
# copying each key by hand (identical settings, less duplication).
# (Alternative: grid_search.best_estimator_ is already refit on the full
# training split when refit=True, the GridSearchCV default.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions, evaluated in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [459]:
# Evaluate the Random Forest on the held-out split: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between two
# probability distributions (it normalizes its inputs to sum to 1); applying it to raw
# regression targets/predictions is not a standard error metric, and zeros in the
# second argument can make it infinite. Interpret this value with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.483882915680185
R2 Score: 0.980047429301368
RMSE: 0.695617
Entropy Value: 0.005987538685788008
In [460]:
# Rank the model inputs by the fitted forest's impurity-based importances
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[460]:
feature importance
0 cardiovasc_death_rate 0.770054
1 diabetes_prevalence 0.089675
5 aged_65_older 0.045220
6 median_age 0.031215
2 female_smokers 0.026417
3 male_smokers 0.023605
4 life_expectancy 0.013812
In [461]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a configurable
# DATA_DIR (pathlib.Path) defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[461]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [462]:
country1 = 'United Kingdom'
country2 = 'United States'

# Keep only the socio-economic columns this model needs, then restrict to the pair
socioeconomic_cols = ['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[socioeconomic_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [463]:
# Display the filtered pair dataframe (rich notebook repr)
df_updated
Out[463]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 0.932 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 0.932 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 0.932 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 0.932 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 0.932 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 0.926 1.2 54225.446 35.608 338289856 1.084791

2136 rows × 8 columns

In [464]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) within each country,
# turning the time series into a supervised-learning table for the Random Forest.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_periods.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [465]:
# The earliest rows of each country have no lag history (NaN); treat missing history as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [466]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the target) and its three lag columns — fitting PCA on the
# target leaks it into the components; drop those columns before fitting.
# NOTE(review): PCA is fit on raw, unstandardized values (scaling happens only after
# the split below), so large-scale columns such as population dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[466]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [467]:
# Keep the first 5 principal components. The count is chosen to match the number of
# input variables for this analysis, not by explained variance; components are ordered
# by decreasing explained variance, so [:, :n_components] keeps the strongest ones.
n_components = 5  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [468]:
# Wrap the principal components in a DataFrame for the downstream model.
# NOTE(review): these columns are principal components (linear mixtures of all PCA
# inputs), not the original features — labelling them with original feature names makes
# the later feature-importance table easy to misread; PC1..PC5 would be truthful names.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [469]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by the cells
# that follow (X is built from principal_df, y from 'Mortality Rate'), so this step
# appears to be dead weight here — confirm before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [470]:
# Principal-component features for the model; principal_df rows are positionally
# aligned with df_updated (transform and get_dummies both preserve row order),
# so X and y line up.
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [471]:
# Fit the standardizer on the training split only, so test-set statistics never
# influence the scaling (avoids train/test leakage at this stage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[471]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [472]:
# Standardize the training features with the training-set mean/std
X_train_scaled = scaler.transform(X_train)
In [473]:
# Standardize the test features with the training-set statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [474]:
# Base estimator; its n_estimators is a placeholder that the grid below overrides
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space (3*3*3*3 = 81 candidate combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [475]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# Scoring defaults to the estimator's score method, i.e. R^2 for a regressor.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameters and their mean cross-validated R^2
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9563158884930116
In [476]:
# Refit a forest using the best hyperparameters found above. Unpacking
# best_params_ keeps this cell in sync with the grid automatically instead of
# copying each key by hand (identical settings, less duplication).
# (Alternative: grid_search.best_estimator_ is already refit on the full
# training split when refit=True, the GridSearchCV default.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions, evaluated in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [477]:
# Evaluate the Random Forest on the held-out split: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between two
# probability distributions (it normalizes its inputs to sum to 1); applying it to raw
# regression targets/predictions is not a standard error metric, and zeros in the
# second argument can make it infinite. Interpret this value with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  2.3376129179018905
R2 Score: 0.9036101802748917
RMSE: 1.528925
Entropy Value: 0.011597152974395022
In [478]:
# Rank the model inputs by the fitted forest's impurity-based importances
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[478]:
feature importance
1 extreme_poverty 0.854034
2 gdp_per_capita 0.085807
3 population_density 0.033015
4 population 0.019051
0 human_development_index 0.008094
In [92]:
# Country Pair by Pair Analysis relative to human development index
In [93]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a configurable
# DATA_DIR (pathlib.Path) defined once at the top of the notebook.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[93]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [94]:
# Showing the pairings of countries based on human development index (13 pairs of countries)
df_Austria = df[(df.location == "Austria")]
df_Belgium = df[(df.location == "Belgium")]

df_Canada = df[(df.location == "Canada")]
df_Denmark = df[(df.location == "Denmark")]

df_Finland = df[(df.location == "Finland")]
df_Iceland = df[(df.location == "Iceland")]

df_Ireland = df[(df.location == "Ireland")]
df_Luxembourg = df[(df.location == "Luxembourg")]

df_Netherlands = df[(df.location == "Netherlands")]
df_Slovenia = df[(df.location == "Slovenia")]

df_Sweden = df[(df.location == "Sweden")]
df_Switzerland = df[(df.location == "Switzerland")]

df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_UnitedStates = df[(df.location == "United States")]

df_Cyprus = df[(df.location == "Cyprus")]
df_Czechia = df[(df.location == "Czechia")]

df_Estonia = df[(df.location == "Estonia")]
df_France = df[(df.location == "France")]

df_Italy = df[(df.location == "Italy")]
df_Latvia = df[(df.location == "Latvia")]

df_Portugal = df[(df.location == "Portugal")]
df_Slovakia = df[(df.location == "Slovakia")]

df_Spain = df[(df.location == "Spain")]
df_Bulgaria = df[(df.location == "Bulgaria")]

df_Romania = df[(df.location == "Romania")]
df_Serbia = df[(df.location == "Serbia")]
In [95]:
# Drop the first two UK rows (iloc[2:] is equivalent to tail(-2): everything except
# the first 2 rows). Presumably this aligns the UK series with its pair — TODO confirm.
df_UnitedKingdom_new = df_UnitedKingdom.iloc[2:]
In [96]:
# Stack the first country of each pairing (plus both members where listed) into a
# single frame, then persist it so later cells can re-load it from disk.
dataframes = [
    df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark,
    df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia,
    df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg,
    df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain,
    df_Slovenia, df_UnitedStates,
]
dataframe_one = pd.concat(dataframes)

# Exporting the combined dataframe to a CSV file
dataframe_one.to_csv("dataframe-one.csv")
In [97]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; the previous cell wrote
# "dataframe-one.csv" to the working directory, yet this reads from Downloads — confirm
# these are the same file.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[97]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [491]:
country1 = 'Austria'
country2 = 'Belgium'

# Keep only the population-health columns this model needs, then restrict to the pair
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [492]:
# Display the filtered pair dataframe (rich notebook repr)
df_updated
Out[492]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2095 Belgium 12/26/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2096 Belgium 12/27/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2097 Belgium 12/28/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2098 Belgium 12/29/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787

2099 rows × 10 columns

In [493]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) within each country,
# turning the time series into a supervised-learning table for the Random Forest.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_periods.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [494]:
# The earliest rows of each country have no lag history (NaN); treat missing history as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [495]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the target) and its three lag columns — fitting PCA on the
# target leaks it into the components; drop those columns before fitting.
# NOTE(review): PCA is fit on raw, unstandardized values (scaling happens only after
# the split below), so the largest-scale columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[495]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [496]:
# Keep the first 7 principal components. The count is chosen to match the number of
# input variables for this analysis, not by explained variance; components are ordered
# by decreasing explained variance, so [:, :n_components] keeps the strongest ones.
n_components = 7  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [497]:
# Wrap the principal components in a DataFrame for the downstream model.
# NOTE(review): these columns are principal components (linear mixtures of all PCA
# inputs), not the original features — labelling them with original feature names makes
# the later feature-importance table easy to misread; PC1..PC7 would be truthful names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [498]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by the cells
# that follow (X is built from principal_df, y from 'Mortality Rate'), so this step
# appears to be dead weight here — confirm before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [499]:
# Principal-component features for the model; principal_df rows are positionally
# aligned with df_updated (transform and get_dummies both preserve row order),
# so X and y line up.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [500]:
# Fit the standardizer on the training split only, so test-set statistics never
# influence the scaling (avoids train/test leakage at this stage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[500]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [501]:
# Standardize the training features with the training-set mean/std
X_train_scaled = scaler.transform(X_train)
In [502]:
# Standardize the test features with the training-set statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [503]:
# Base estimator; its n_estimators is a placeholder that the grid below overrides
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space (3*3*3*3 = 81 candidate combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [504]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# Scoring defaults to the estimator's score method, i.e. R^2 for a regressor.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameters and their mean cross-validated R^2
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985702277201526
In [505]:
# Refit a forest using the best hyperparameters found above. Unpacking
# best_params_ keeps this cell in sync with the grid automatically instead of
# copying each key by hand (identical settings, less duplication).
# (Alternative: grid_search.best_estimator_ is already refit on the full
# training split when refit=True, the GridSearchCV default.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Held-out predictions, evaluated in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [506]:
# Evaluate the Random Forest on the held-out split: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between two
# probability distributions (it normalizes its inputs to sum to 1); applying it to raw
# regression targets/predictions is not a standard error metric, and zeros in the
# second argument can make it infinite. Interpret this value with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.020452761128882797
R2 Score: 0.9982725374621599
RMSE: 0.143013
Entropy Value: 0.0008156180119545589
In [507]:
# Rank the model inputs by the fitted forest's impurity-based importances
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[507]:
feature importance
6 median_age 0.853422
1 diabetes_prevalence 0.089967
0 cardiovasc_death_rate 0.036597
5 aged_65_older 0.011508
3 male_smokers 0.004570
2 female_smokers 0.003805
4 life_expectancy 0.000130
In [508]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a configurable
# DATA_DIR (pathlib.Path) defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[508]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [509]:
country1 = 'Austria'
country2 = 'Belgium'

# Keep only the country-level infrastructure/economy columns this model needs,
# then restrict to the pair
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[country_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [510]:
# Display the filtered pair dataframe (rich notebook repr)
df_updated
Out[510]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 5.64 0.2 42658.576 375.564 11655923 0.711787
2095 Belgium 12/26/2022 5.64 0.2 42658.576 375.564 11655923 0.711787
2096 Belgium 12/27/2022 5.64 0.2 42658.576 375.564 11655923 0.711787
2097 Belgium 12/28/2022 5.64 0.2 42658.576 375.564 11655923 0.711787
2098 Belgium 12/29/2022 5.64 0.2 42658.576 375.564 11655923 0.711787

2099 rows × 8 columns

In [511]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) within each country,
# turning the time series into a supervised-learning table for the Random Forest.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_periods.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [512]:
# The earliest rows of each country have no lag history (NaN); treat missing history as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [513]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the target) and its three lag columns — fitting PCA on the
# target leaks it into the components; drop those columns before fitting.
# NOTE(review): PCA is fit on raw, unstandardized values (scaling happens only after
# the split below), so large-scale columns such as population dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[513]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [514]:
# Keep the first 5 principal components. The count is chosen to match the number of
# input variables for this analysis, not by explained variance; components are ordered
# by decreasing explained variance, so [:, :n_components] keeps the strongest ones.
n_components = 5  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [515]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component — a linear combination of
# ALL input columns — but it is labeled with a single original feature name. Any
# downstream "feature importance" computed on these columns describes the components,
# not the named raw features; consider names like 'PC1'..'PC5' instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [516]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used as model
# inputs below (X is built from principal_df); the practical effect of this cell
# is only to remove the string 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [517]:
# The five retained principal components serve as model inputs; the raw
# mortality rate is the regression target.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [518]:
# Fit scaling on the training set
# Mean/std are estimated from the training split only, so no information from
# the test split leaks into the standardization.
scaler = StandardScaler()
scaler.fit(X_train)
Out[518]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [519]:
# Apply scaling on the training set (transform only; parameters were fit above)
X_train_scaled = scaler.transform(X_train)
In [520]:
# Apply scaling on the test set using the training-set mean/std (no refit)
X_test_scaled = scaler.transform(X_test)
In [521]:
# Base regressor; the n_estimators given here is a placeholder that the
# grid search below overrides.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space (3 * 3 * 3 * 4 = 108 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [522]:
# Exhaustive grid search over the 108 parameter combinations with 10-fold
# cross-validation (default scoring for a regressor is R^2 — 1080 fits total).
# n_jobs=-1 spreads the fits over all CPU cores; results are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974451044142482
In [523]:
# GridSearchCV already refits the best parameter combination on the full
# training set (refit=True is the default), so reuse that fitted estimator
# instead of reconstructing and retraining an identical forest by hand.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test rows.
y_pred = best_rf_model.predict(X_test_scaled)
In [524]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments to sum to 1
# and returns the KL divergence between those distributions — it is not a standard
# regression metric, and any zero prediction where the actual value is nonzero
# yields inf. Confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010069518672884694
R2 Score: 0.9991495174577224
RMSE: 0.100347
Entropy Value: 0.00034019584039697294
In [525]:
# Rank the model inputs by the forest's impurity-based importance scores.
# Note: these inputs are principal components that were labeled with the
# original feature names upstream, not the raw features themselves.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[525]:
feature importance
1 extreme_poverty 0.935823
2 gdp_per_capita 0.050241
0 hospital_beds_per_thousand 0.009467
3 population_density 0.003676
4 population 0.000793
In [526]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[526]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [527]:
country1 = 'Canada'
country2 = 'Denmark'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [528]:
df_updated
Out[528]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5188 Denmark 2/3/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5189 Denmark 2/4/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5190 Denmark 2/5/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5191 Denmark 2/6/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2134 rows × 10 columns

In [529]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [530]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [531]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] here appears to include 'Mortality Rate' and its lag
# features among the PCA inputs — that leaks the target into the features; and
# PCA is fit on unscaled data, so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[531]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [532]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [533]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [534]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [535]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [536]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[536]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [537]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [538]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [539]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [540]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9984540951428933
In [541]:
# GridSearchCV already refits the best parameter combination on the full
# training set (refit=True is the default), so reuse that fitted estimator
# instead of reconstructing and retraining an identical forest by hand.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test rows.
y_pred = best_rf_model.predict(X_test_scaled)
In [542]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and returns
# a KL divergence between distributions — not a standard regression metric, and
# inf-prone when a prediction is 0 where the actual value is not. Confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002748350813295417
R2 Score: 0.9993440760119627
RMSE: 0.052425
Entropy Value: 0.00028602597187348704
In [543]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[543]:
feature importance
1 diabetes_prevalence 0.742827
0 cardiovasc_death_rate 0.187586
6 median_age 0.034101
2 female_smokers 0.019551
5 aged_65_older 0.014214
3 male_smokers 0.001588
4 life_expectancy 0.000133
In [544]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[544]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [545]:
country1 = 'Canada'
country2 = 'Denmark'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [546]:
df_updated
Out[546]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.5 0.2 46682.515 136.520 5882259 0.000000
5188 Denmark 2/3/2020 2.5 0.2 46682.515 136.520 5882259 0.000000
5189 Denmark 2/4/2020 2.5 0.2 46682.515 136.520 5882259 0.000000
5190 Denmark 2/5/2020 2.5 0.2 46682.515 136.520 5882259 0.000000
5191 Denmark 2/6/2020 2.5 0.2 46682.515 136.520 5882259 0.000000
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.5 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.5 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.5 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.5 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.5 44017.591 4.037 38454328 1.093162

2134 rows × 8 columns

In [547]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [548]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [549]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] here appears to include 'Mortality Rate' and its lag
# features among the PCA inputs — that leaks the target into the features; and
# PCA is fit on unscaled data, so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[549]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [550]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [551]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [552]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [553]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [554]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[554]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [555]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [556]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [557]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [558]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985212702477563
In [559]:
# GridSearchCV already refits the best parameter combination on the full
# training set (refit=True is the default), so reuse that fitted estimator
# instead of reconstructing and retraining an identical forest by hand.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test rows.
y_pred = best_rf_model.predict(X_test_scaled)
In [560]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and returns
# a KL divergence between distributions — not a standard regression metric, and
# inf-prone when a prediction is 0 where the actual value is not. Confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004149548610657574
R2 Score: 0.9990096648287803
RMSE: 0.064417
Entropy Value: 0.0005668080870825645
In [561]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[561]:
feature importance
1 extreme_poverty 0.937978
0 hospital_beds_per_thousand 0.035832
2 gdp_per_capita 0.021979
3 population_density 0.003812
4 population 0.000400
In [562]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[562]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [563]:
country1 = 'Finland'
country2 = 'Iceland'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [564]:
df_updated
Out[564]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7311 Finland 1/30/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7312 Finland 1/31/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7313 Finland 2/1/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7314 Finland 2/2/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
... ... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2102 rows × 10 columns

In [565]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [566]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [567]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] here appears to include 'Mortality Rate' and its lag
# features among the PCA inputs — that leaks the target into the features; and
# PCA is fit on unscaled data, so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[567]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [568]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [569]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [570]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [571]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [572]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[572]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [573]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [574]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [575]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [576]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9955159123177418
In [577]:
# GridSearchCV already refits the best parameter combination on the full
# training set (refit=True is the default), so reuse that fitted estimator
# instead of reconstructing and retraining an identical forest by hand.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test rows.
y_pred = best_rf_model.predict(X_test_scaled)
In [578]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and returns
# a KL divergence between distributions — not a standard regression metric, and
# inf-prone when a prediction is 0 where the actual value is not. Confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003560841308279542
R2 Score: 0.9970021670932279
RMSE: 0.059673
Entropy Value: 0.0011802071599462333
In [579]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[579]:
feature importance
1 diabetes_prevalence 0.517316
0 cardiovasc_death_rate 0.432531
6 median_age 0.019175
5 aged_65_older 0.014394
2 female_smokers 0.011943
3 male_smokers 0.003277
4 life_expectancy 0.001363
In [580]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- breaks on any other machine;
# prefer a DATA_DIR constant defined in a config cell at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated  # rich display of the freshly loaded frame
Out[580]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [581]:
# Country pair compared in this section.
country1 = 'Finland'
country2 = 'Iceland'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame: later cells add
# lagged columns to it, which on a filtered view raises SettingWithCopyWarning
# and can silently fail to write.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [582]:
# Inspect the filtered two-country subset (last expression renders as a table)
df_updated
Out[582]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
7310 Finland 1/29/2020 3.28 0.04 40585.721 18.136 5540745 0.00000
7311 Finland 1/30/2020 3.28 0.04 40585.721 18.136 5540745 0.00000
7312 Finland 1/31/2020 3.28 0.04 40585.721 18.136 5540745 0.00000
7313 Finland 2/1/2020 3.28 0.04 40585.721 18.136 5540745 0.00000
7314 Finland 2/2/2020 3.28 0.04 40585.721 18.136 5540745 0.00000
... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.20 46482.958 3.404 372903 0.11011

2102 rows × 8 columns

In [583]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build each lagged mortality column (1 day, 7 days, 30 days back); shifting
# within the location group keeps one country's history from leaking into the other's.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [584]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country's series have no earlier value to lag from).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [585]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# includes 'Mortality Rate' (the target) and its lagged copies -- the principal
# components therefore encode the target itself (leakage into X). PCA is also
# fit on unscaled data and on the full dataset before the train/test split --
# confirm these choices are intentional.
pca.fit(df_updated.iloc[:,2:])
Out[585]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [586]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first n_components columns (highest-variance directions) of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [587]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading -- each column is a principal
# component (a linear mix of all PCA inputs), not the original feature it is
# named after. Downstream "feature importances" therefore rank components, not
# the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values  # carry the country label alongside the components
In [588]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used below -- X is built from principal_df -- but this keeps df_updated fully numeric)
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [589]:
# Assemble the model matrix from the PCA-projected columns and the raw target.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [590]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics never leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[590]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [591]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # z-score using training-set mean/std
In [592]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuse training-set mean/std (no refit)
In [593]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder -- every value in param_grid overrides it during the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [594]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 * 10 fits across all cores; results are unchanged
# because every candidate fit is still seeded by the estimator's random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9954917175836794
In [595]:
# Refit a RandomForestRegressor on the full training set using the best
# hyperparameters found by the grid search, then predict on the held-out test set.
# Unpacking best_params_ directly avoids hand-copying each key (the grid keys
# match the estimator's constructor arguments); grid_search.best_estimator_
# (refit=True default) would be an equivalent, already-fitted model.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [596]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)  # mean squared error on the held-out test set
rmse = np.sqrt(mse)  # RMSE, in the same units as the target
score = r2_score(y_test, y_pred)  # coefficient of determination
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (normalizes them, computes KL divergence); y_test/y_pred are
# regression targets, so this "entropy" has no clear meaning here -- confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002995537974182363
R2 Score: 0.9974780897167169
RMSE: 0.054732
Entropy Value: 0.0010015390611765433
In [597]:
# Rank the model inputs by impurity-based (Gini) importance, largest first.
# Built in one chain so the name is never reused for two different things
# (the original bound it to the raw array, then to the DataFrame).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[597]:
feature importance
1 extreme_poverty 0.940726
0 hospital_beds_per_thousand 0.036143
2 gdp_per_capita 0.013851
3 population_density 0.006981
4 population 0.002299
In [598]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- breaks on any other machine;
# prefer a DATA_DIR constant defined in a config cell at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated  # rich display of the freshly loaded frame
Out[598]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [599]:
# Country pair compared in this section.
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame: later cells add
# lagged columns to it, which on a filtered view raises SettingWithCopyWarning
# and can silently fail to write.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [600]:
# Inspect the filtered two-country subset (last expression renders as a table)
df_updated
Out[600]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388

2076 rows × 10 columns

In [601]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build each lagged mortality column (1 day, 7 days, 30 days back); shifting
# within the location group keeps one country's history from leaking into the other's.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [602]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country's series have no earlier value to lag from).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [603]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# includes 'Mortality Rate' (the target) and its lagged copies -- the principal
# components therefore encode the target itself (leakage into X). PCA is also
# fit on unscaled data and on the full dataset before the train/test split --
# confirm these choices are intentional.
pca.fit(df_updated.iloc[:,2:])
Out[603]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [604]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first n_components columns (highest-variance directions) of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [605]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading -- each column is a principal
# component (a linear mix of all PCA inputs), not the original feature it is
# named after. Downstream "feature importances" therefore rank components, not
# the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values  # carry the country label alongside the components
In [606]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used below -- X is built from principal_df -- but this keeps df_updated fully numeric)
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [607]:
# Assemble the model matrix from the PCA-projected columns and the raw target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [608]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics never leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[608]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [609]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # z-score using training-set mean/std
In [610]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuse training-set mean/std (no refit)
In [611]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder -- every value in param_grid overrides it during the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [612]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 * 10 fits across all cores; results are unchanged
# because every candidate fit is still seeded by the estimator's random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9981832621613457
In [613]:
# Refit a RandomForestRegressor on the full training set using the best
# hyperparameters found by the grid search, then predict on the held-out test set.
# Unpacking best_params_ directly avoids hand-copying each key (the grid keys
# match the estimator's constructor arguments); grid_search.best_estimator_
# (refit=True default) would be an equivalent, already-fitted model.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [614]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)  # mean squared error on the held-out test set
rmse = np.sqrt(mse)  # RMSE, in the same units as the target
score = r2_score(y_test, y_pred)  # coefficient of determination
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (normalizes them, computes KL divergence); y_test/y_pred are
# regression targets, so this "entropy" has no clear meaning here -- confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002099363526029487
R2 Score: 0.9990806391844732
RMSE: 0.045819
Entropy Value: 0.0003826774935355706
In [615]:
# Rank the model inputs by impurity-based (Gini) importance, largest first.
# Built in one chain so the name is never reused for two different things
# (the original bound it to the raw array, then to the DataFrame).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[615]:
feature importance
5 aged_65_older 0.508703
0 cardiovasc_death_rate 0.420926
2 female_smokers 0.028547
6 median_age 0.024134
1 diabetes_prevalence 0.015762
3 male_smokers 0.001625
4 life_expectancy 0.000304
In [616]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- breaks on any other machine;
# prefer a DATA_DIR constant defined in a config cell at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated  # rich display of the freshly loaded frame
Out[616]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [617]:
# Country pair compared in this section.
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame: later cells add
# lagged columns to it, which on a filtered view raises SettingWithCopyWarning
# and can silently fail to write.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [618]:
# Inspect the filtered two-country subset (last expression renders as a table)
df_updated
Out[618]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.2 67335.293 69.874 5023108 0.491388
19869 Ireland 12/26/2022 2.96 0.2 67335.293 69.874 5023108 0.491388
19870 Ireland 12/27/2022 2.96 0.2 67335.293 69.874 5023108 0.491388
19871 Ireland 12/28/2022 2.96 0.2 67335.293 69.874 5023108 0.491388
19872 Ireland 12/29/2022 2.96 0.2 67335.293 69.874 5023108 0.491388

2076 rows × 8 columns

In [619]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build each lagged mortality column (1 day, 7 days, 30 days back); shifting
# within the location group keeps one country's history from leaking into the other's.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [620]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country's series have no earlier value to lag from).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [621]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# includes 'Mortality Rate' (the target) and its lagged copies -- the principal
# components therefore encode the target itself (leakage into X). PCA is also
# fit on unscaled data and on the full dataset before the train/test split --
# confirm these choices are intentional.
pca.fit(df_updated.iloc[:,2:])
Out[621]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [622]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first n_components columns (highest-variance directions) of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [623]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading -- each column is a principal
# component (a linear mix of all PCA inputs), not the original feature it is
# named after. Downstream "feature importances" therefore rank components, not
# the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values  # carry the country label alongside the components
In [624]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used below -- X is built from principal_df -- but this keeps df_updated fully numeric)
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [625]:
# Assemble the model matrix from the PCA-projected columns and the raw target.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [626]:
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics never leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[626]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [627]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # z-score using training-set mean/std
In [628]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuse training-set mean/std (no refit)
In [629]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is a placeholder -- every value in param_grid overrides it during the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [630]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 * 10 fits across all cores; results are unchanged
# because every candidate fit is still seeded by the estimator's random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9981230977543898
In [631]:
# Refit a RandomForestRegressor on the full training set using the best
# hyperparameters found by the grid search, then predict on the held-out test set.
# Unpacking best_params_ directly avoids hand-copying each key (the grid keys
# match the estimator's constructor arguments); grid_search.best_estimator_
# (refit=True default) would be an equivalent, already-fitted model.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [632]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)  # mean squared error on the held-out test set
rmse = np.sqrt(mse)  # RMSE, in the same units as the target
score = r2_score(y_test, y_pred)  # coefficient of determination
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (normalizes them, computes KL divergence); y_test/y_pred are
# regression targets, so this "entropy" has no clear meaning here -- confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0038624036497071725
R2 Score: 0.9983085623212649
RMSE: 0.062148
Entropy Value: 0.0008228738012960621
In [633]:
# Rank the model inputs by impurity-based (Gini) importance, largest first.
# Built in one chain so the name is never reused for two different things
# (the original bound it to the raw array, then to the DataFrame).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[633]:
feature importance
1 extreme_poverty 0.920449
2 gdp_per_capita 0.043623
0 hospital_beds_per_thousand 0.027483
3 population_density 0.007933
4 population 0.000512
In [634]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- breaks on any other machine;
# prefer a DATA_DIR constant defined in a config cell at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated  # rich display of the freshly loaded frame
Out[634]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [635]:
# Country pair compared in this section.
country1 = 'Netherlands'
country2 = 'Slovenia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame: later cells add
# lagged columns to it, which on a filtered view raises SettingWithCopyWarning
# and can silently fail to write.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [636]:
# Inspect the filtered two-country subset (last expression renders as a table)
df_updated
Out[636]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2099 rows × 10 columns

In [637]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build each lagged mortality column (1 day, 7 days, 30 days back); shifting
# within the location group keeps one country's history from leaking into the other's.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [638]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country have no prior observation to lag from).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [639]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here
# includes the 'Mortality Rate' target and its three lagged copies alongside the
# predictors — the fitted components therefore mix the target into the inputs
# (target leakage). Confirm and restrict to predictor columns only.
# NOTE(review): PCA is fitted on unscaled data; scaling happens only after the
# transform further below, whereas standardizing before PCA is conventional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[639]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [640]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first n_components score columns; the rest are discarded.
# NOTE(review): the transform input (iloc[:, 2:]) still contains 'Mortality Rate'
# and its lags, so these scores are linear mixes that include the target.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [641]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores, not the original
# variables — labelling PC1..PC7 with raw feature names makes the downstream
# feature-importance table look like it ranks the original predictors when it
# actually ranks anonymous component axes. Consider names like 'PC1'..'PC7'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [642]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never read by the later cells
# (X is built from principal_df and y from 'Mortality Rate') — presumably leftover.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [643]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # the first 7 PCA scores (carrying raw feature labels)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, mixing past and future rows
# of this daily time series across train/test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [644]:
# Fit scaling on the training set
# Fitting on the training split only avoids leaking test-set statistics into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[644]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [645]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses the train-split mean/std fitted above
In [646]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # same train-fitted scaler; no refit on test data
In [647]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 here is only a placeholder: the grid search below overrides it
# with each value from param_grid['n_estimators'].
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [648]:
# perform grid search and 10-fold cross-validation (k = 10)
# Uses the regressor's default scorer (R^2); 81 parameter combinations x 10 folds.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9986942730831515
In [649]:
# fit random forest model with best hyperparameters from above
# Unpacking best_params_ keeps this cell automatically in sync with param_grid
# instead of re-listing every hyperparameter by hand (a drift-prone duplication);
# random_state=42 matches the seed used during the grid search for reproducibility.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality on the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [650]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and computes
# the KL divergence between them as probability distributions — it is not a
# regression-error metric. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003718669729409549
R2 Score: 0.999546121567054
RMSE: 0.060981
Entropy Value: 0.00037704341816569313
In [651]:
# Rank the model inputs by Random Forest importance (the inputs are PCA scores
# carrying the original feature labels — see the PCA cells above).
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[651]:
feature importance
1 diabetes_prevalence 0.860422
5 aged_65_older 0.118967
2 female_smokers 0.012216
6 median_age 0.002841
3 male_smokers 0.002676
0 cardiovasc_death_rate 0.002590
4 life_expectancy 0.000288
In [652]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[652]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [653]:
country1 = 'Netherlands'
country2 = 'Slovenia'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the pair subset an independent frame so the later lagged-column
# assignments do not trigger SettingWithCopyWarning on a view of the parent frame.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [654]:
# Inspect the filtered pair subset via the notebook's rich display.
df_updated
Out[654]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.0 31400.840 102.619 2119843 0.536669

2099 rows × 8 columns

In [655]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the per-country lagged mortality features (1-day, 7-day, 30-day lags)
# in one loop rather than three near-identical statements.
for lag_name, lag_days in [('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)]:
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [656]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country have no prior observation to lag from).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [657]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here
# includes the 'Mortality Rate' target and its three lagged copies alongside the
# predictors — the fitted components therefore mix the target into the inputs
# (target leakage). Confirm and restrict to predictor columns only.
# NOTE(review): PCA is fitted on unscaled data; scaling happens only after the
# transform further below, whereas standardizing before PCA is conventional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[657]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [658]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first n_components score columns; the rest are discarded.
# NOTE(review): the transform input (iloc[:, 2:]) still contains 'Mortality Rate'
# and its lags, so these scores are linear mixes that include the target.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [659]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores, not the original
# variables — labelling PC1..PC5 with raw feature names makes the downstream
# feature-importance table look like it ranks the original predictors when it
# actually ranks anonymous component axes. Consider names like 'PC1'..'PC5'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [660]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never read by the later cells
# (X is built from principal_df and y from 'Mortality Rate') — presumably leftover.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [661]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # the first 5 PCA scores (carrying raw feature labels)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, mixing past and future rows
# of this daily time series across train/test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [662]:
# Fit scaling on the training set
# Fitting on the training split only avoids leaking test-set statistics into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[662]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [663]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses the train-split mean/std fitted above
In [664]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # same train-fitted scaler; no refit on test data
In [665]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 here is only a placeholder: the grid search below overrides it
# with each value from param_grid['n_estimators'].
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [666]:
# perform grid search and 10-fold cross-validation (k = 10)
# Uses the regressor's default scorer (R^2); 81 parameter combinations x 10 folds.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9987719337602877
In [667]:
# fit random forest model with best hyperparameters from above
# Unpacking best_params_ keeps this cell automatically in sync with param_grid
# instead of re-listing every hyperparameter by hand (a drift-prone duplication);
# random_state=42 matches the seed used during the grid search for reproducibility.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality on the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [668]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and computes
# the KL divergence between them as probability distributions — it is not a
# regression-error metric. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006276054785219446
R2 Score: 0.9992339825479873
RMSE: 0.079222
Entropy Value: 0.0005192822850310188
In [669]:
# Rank the model inputs by Random Forest importance (the inputs are PCA scores
# carrying the original feature labels — see the PCA cells above).
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[669]:
feature importance
1 extreme_poverty 0.968319
2 gdp_per_capita 0.027643
0 hospital_beds_per_thousand 0.001991
3 population_density 0.001591
4 population 0.000457
In [670]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[670]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [671]:
country1 = 'Sweden'
country2 = 'Switzerland'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the pair subset an independent frame so the later lagged-column
# assignments do not trigger SettingWithCopyWarning on a view of the parent frame.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [672]:
# Inspect the filtered pair subset via the notebook's rich display.
df_updated
Out[672]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.816005

2102 rows × 10 columns

In [673]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the per-country lagged mortality features (1-day, 7-day, 30-day lags)
# in one loop rather than three near-identical statements.
for lag_name, lag_days in [('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)]:
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [674]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country have no prior observation to lag from).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [675]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here
# includes the 'Mortality Rate' target and its three lagged copies alongside the
# predictors — the fitted components therefore mix the target into the inputs
# (target leakage). Confirm and restrict to predictor columns only.
# NOTE(review): PCA is fitted on unscaled data; scaling happens only after the
# transform further below, whereas standardizing before PCA is conventional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[675]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [676]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first n_components score columns; the rest are discarded.
# NOTE(review): the transform input (iloc[:, 2:]) still contains 'Mortality Rate'
# and its lags, so these scores are linear mixes that include the target.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [677]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores, not the original
# variables — labelling PC1..PC7 with raw feature names makes the downstream
# feature-importance table look like it ranks the original predictors when it
# actually ranks anonymous component axes. Consider names like 'PC1'..'PC7'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [678]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never read by the later cells
# (X is built from principal_df and y from 'Mortality Rate') — presumably leftover.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [679]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # the first 7 PCA scores (carrying raw feature labels)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, mixing past and future rows
# of this daily time series across train/test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [680]:
# Fit scaling on the training set
# Fitting on the training split only avoids leaking test-set statistics into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[680]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [681]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses the train-split mean/std fitted above
In [682]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # same train-fitted scaler; no refit on test data
In [683]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 here is only a placeholder: the grid search below overrides it
# with each value from param_grid['n_estimators'].
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [684]:
# perform grid search and 10-fold cross-validation (k = 10)
# Uses the regressor's default scorer (R^2); 81 parameter combinations x 10 folds.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9965358172344019
In [685]:
# fit random forest model with best hyperparameters from above
# Unpacking best_params_ keeps this cell automatically in sync with param_grid
# instead of re-listing every hyperparameter by hand (a drift-prone duplication);
# random_state=42 matches the seed used during the grid search for reproducibility.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality on the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [686]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and computes
# the KL divergence between them as probability distributions — it is not a
# regression-error metric. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.030651412600525802
R2 Score: 0.9941440982603341
RMSE: 0.175075
Entropy Value: 0.0009864234116361557
In [687]:
# Rank the model inputs by Random Forest importance (the inputs are PCA scores
# carrying the original feature labels — see the PCA cells above).
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[687]:
feature importance
1 diabetes_prevalence 0.841516
6 median_age 0.093364
5 aged_65_older 0.036116
0 cardiovasc_death_rate 0.013821
2 female_smokers 0.009758
3 male_smokers 0.004457
4 life_expectancy 0.000968
In [688]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[688]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [689]:
country1 = 'Sweden'
country2 = 'Switzerland'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the pair subset an independent frame so the later lagged-column
# assignments do not trigger SettingWithCopyWarning on a view of the parent frame.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [690]:
# Inspect the filtered pair subset via the notebook's rich display.
df_updated
Out[690]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.50 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.50 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.50 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.50 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.50 46949.283 24.718 10549349 0.816005

2102 rows × 8 columns

In [691]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the per-country lagged mortality features (1-day, 7-day, 30-day lags)
# in one loop rather than three near-identical statements.
for lag_name, lag_days in [('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)]:
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [692]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country have no prior observation to lag from).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [693]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here
# includes the 'Mortality Rate' target and its three lagged copies alongside the
# predictors — the fitted components therefore mix the target into the inputs
# (target leakage). Confirm and restrict to predictor columns only.
# NOTE(review): PCA is fitted on unscaled data; scaling happens only after the
# transform further below, whereas standardizing before PCA is conventional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[693]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [694]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first n_components score columns; the rest are discarded.
# NOTE(review): the transform input (iloc[:, 2:]) still contains 'Mortality Rate'
# and its lags, so these scores are linear mixes that include the target.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [695]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores, not the original
# variables — labelling PC1..PC5 with raw feature names makes the downstream
# feature-importance table look like it ranks the original predictors when it
# actually ranks anonymous component axes. Consider names like 'PC1'..'PC5'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [696]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never read by the later cells
# (X is built from principal_df and y from 'Mortality Rate') — presumably leftover.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [697]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # the first 5 PCA scores (carrying raw feature labels)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, mixing past and future rows
# of this daily time series across train/test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [698]:
# Fit scaling on the training set
# Fitting on the training split only avoids leaking test-set statistics into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[698]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [699]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses the train-split mean/std fitted above
In [700]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # same train-fitted scaler; no refit on test data
In [701]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 here is only a placeholder: the grid search below overrides it
# with each value from param_grid['n_estimators'].
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [702]:
# perform grid search and 10-fold cross-validation (k = 10)
# Uses the regressor's default scorer (R^2); 81 parameter combinations x 10 folds.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.996752968199182
In [703]:
# fit random forest model with best hyperparameters from above
# Unpacking best_params_ keeps this cell automatically in sync with param_grid
# instead of re-listing every hyperparameter by hand (a drift-prone duplication);
# random_state=42 matches the seed used during the grid search for reproducibility.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality on the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)
In [704]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes KL divergence after
# normalizing both vectors to sum to 1, i.e. it treats the mortality rates as
# probability distributions -- an unusual choice for a regression metric;
# confirm this is intentional (it also requires non-negative values).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.024455205155626165
R2 Score: 0.9953278734562381
RMSE: 0.156382
Entropy Value: 0.0011192911652519264
In [705]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[705]:
feature importance
1 extreme_poverty 0.961161
2 gdp_per_capita 0.020804
0 hospital_beds_per_thousand 0.011661
3 population_density 0.005177
4 population 0.001197
In [706]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- consider a configurable
# DATA_DIR / relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[706]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [707]:
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [708]:
df_updated
Out[708]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 10 columns

In [709]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [710]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [711]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[711]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [712]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [713]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL
# input features), not the original variables -- labelling them with the raw
# feature names is misleading; the "feature importances" computed later
# therefore rank components, not the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [714]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [715]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of autocorrelated daily rows -- neighbouring days
# appear in both train and test, inflating the test-set scores reported below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [716]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[716]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [717]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [718]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [719]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [720]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9639563497859113
In [721]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [722]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes KL divergence after
# normalizing both vectors to sum to 1, i.e. it treats the mortality rates as
# probability distributions -- an unusual choice for a regression metric;
# confirm this is intentional (it also requires non-negative values).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.483882915680185
R2 Score: 0.980047429301368
RMSE: 0.695617
Entropy Value: 0.005987538685788008
In [723]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[723]:
feature importance
0 cardiovasc_death_rate 0.770054
1 diabetes_prevalence 0.089675
5 aged_65_older 0.045220
6 median_age 0.031215
2 female_smokers 0.026417
3 male_smokers 0.023605
4 life_expectancy 0.013812
In [724]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- consider a configurable
# DATA_DIR / relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[724]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [725]:
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [726]:
df_updated
Out[726]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 1.2 54225.446 35.608 338289856 1.084791

2136 rows × 8 columns

In [727]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [728]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [729]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[729]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [730]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [731]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL
# input features), not the original variables -- labelling them with the raw
# feature names is misleading; the "feature importances" computed later
# therefore rank components, not the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [732]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [733]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of autocorrelated daily rows -- neighbouring days
# appear in both train and test, inflating the test-set scores reported below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [734]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[734]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [735]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [736]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [737]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [738]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9563158884930116
In [739]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [740]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes KL divergence after
# normalizing both vectors to sum to 1, i.e. it treats the mortality rates as
# probability distributions -- an unusual choice for a regression metric;
# confirm this is intentional (it also requires non-negative values).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  2.3376129179018905
R2 Score: 0.9036101802748917
RMSE: 1.528925
Entropy Value: 0.011597152974395022
In [741]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[741]:
feature importance
1 extreme_poverty 0.854034
2 gdp_per_capita 0.085807
3 population_density 0.033015
4 population 0.019051
0 hospital_beds_per_thousand 0.008094
In [742]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- consider a configurable
# DATA_DIR / relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[742]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [743]:
country1 = 'Cyprus'
country2 = 'Czechia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [744]:
df_updated
Out[744]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919575

2061 rows × 10 columns

In [745]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [746]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [747]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[747]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [748]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [749]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL
# input features), not the original variables -- labelling them with the raw
# feature names is misleading; the "feature importances" computed later
# therefore rank components, not the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [750]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [751]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random split of autocorrelated daily rows -- neighbouring days
# appear in both train and test, inflating the test-set scores reported below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [752]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[752]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [753]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [754]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [755]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [756]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9931354068766577
In [757]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [758]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes KL divergence after
# normalizing both vectors to sum to 1, i.e. it treats the mortality rates as
# probability distributions -- an unusual choice for a regression metric;
# confirm this is intentional (it also requires non-negative values).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00047399699097167984
R2 Score: 0.9991753265149302
RMSE: 0.021771
Entropy Value: 0.00015620474538757492
In [759]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[759]:
feature importance
1 diabetes_prevalence 0.658891
0 cardiovasc_death_rate 0.219210
5 aged_65_older 0.052242
6 median_age 0.044419
2 female_smokers 0.022219
3 male_smokers 0.001522
4 life_expectancy 0.001498
In [760]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- consider a configurable
# DATA_DIR / relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[760]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [761]:
country1 = 'Cyprus'
country2 = 'Czechia'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [762]:
df_updated
Out[762]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.40 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.40 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.40 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.40 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.63 0.00 32605.906 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.63 0.00 32605.906 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.63 0.00 32605.906 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.63 0.00 32605.906 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.63 0.00 32605.906 137.176 10493990 0.919575

2061 rows × 8 columns

In [763]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features: mortality 1 day, 7 days and 30 days back,
# computed per country so lagged values never cross a location boundary.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, periods in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(periods)
In [764]:
# The first 1 / 7 / 30 rows of each country's series have no lagged value;
# treat the pre-observation period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [765]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the regression target) and the
# three lagged-mortality columns, so the components are partly built from the target —
# confirm this leakage is intended before interpreting downstream scores.
# NOTE(review): PCA is fitted on unscaled data here while StandardScaler is applied only
# after the transform; the conventional order is to standardise first, then fit PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[765]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [766]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# pca was fitted with every component retained, so keep only the first n_components
# score columns (equivalent to having fitted PCA(n_components=5) directly).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [767]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component SCORES that are merely labelled
# with the original variable names — the first component is not 'hospital_beds_per_thousand'.
# Renaming them PC1..PC5 would prevent misreading the later feature-importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [768]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns are never used downstream (X is built from
# principal_df); the practical effect is only to remove 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [769]:
# Assemble the model matrix from the principal-component scores and take the target
# from the (unchanged) mortality column of the working frame.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [770]:
# Fit scaling on the training set
# (mean/std are learned from X_train only, so the test split cannot leak into the
# scaler; note the inputs at this point are already PCA scores, not raw features)
scaler = StandardScaler()
scaler.fit(X_train)
Out[770]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [771]:
# Apply scaling on the training set
# (uses the mean/std estimated from X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [772]:
# Apply scaling on the test set
# (reuses the training-set statistics — the scaler is never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [773]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — GridSearchCV below overrides every
# parameter listed in param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3 x 3 x 3 x 3 = 81 candidate combinations, searched exhaustively
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [774]:
# perform grid search and 10-fold cross-validation (k = 10)
# (GridSearchCV's default scoring for a regressor is R^2)
# NOTE(review): rows are shuffled daily time-series observations with largely constant
# per-country features, so this CV score is likely optimistic; a time-based split
# would be a stricter validation — confirm before reporting.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9866151912407417
In [775]:
# Refit a random forest with the best hyperparameter combination found above
# (the param_grid keys map one-to-one onto RandomForestRegressor constructor arguments).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Generate test-set predictions for evaluation in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [776]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) returns the Kullback-Leibler divergence of
# the two (normalised) sequences, not an entropy of the prediction errors — confirm
# this is the intended metric before reporting it as "Entropy".
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008157893614715191
R2 Score: 0.9858066639953047
RMSE: 0.090321
Entropy Value: 0.002246714925831334
In [777]:
# Rank the model inputs by impurity-based feature importance.
# NOTE(review): the inputs are principal-component scores that were labelled with the
# raw variable names, so e.g. 'extreme_poverty' below denotes the second principal
# component, not the raw extreme_poverty column — interpret the ranking accordingly.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[777]:
feature importance
1 extreme_poverty 0.818042
0 hospital_beds_per_thousand 0.113415
2 gdp_per_capita 0.051214
3 population_density 0.013891
4 population 0.003438
In [778]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute local path — consider a configurable DATA_DIR /
# relative path so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[778]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [779]:
country1 = 'Estonia'
country2 = 'France'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the filtered subset from the parent frame so the lagged-column
# assignments in the next cells cannot raise SettingWithCopyWarning or silently
# write into a view of the original DataFrame.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [780]:
df_updated
Out[780]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411892

2132 rows × 10 columns

In [781]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features: mortality 1 day, 7 days and 30 days back,
# computed per country so lagged values never cross a location boundary.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, periods in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(periods)
In [782]:
# The first 1 / 7 / 30 rows of each country's series have no lagged value;
# treat the pre-observation period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [783]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the regression target) and the
# three lagged-mortality columns, so the components are partly built from the target —
# confirm this leakage is intended before interpreting downstream scores.
# NOTE(review): PCA is fitted on unscaled data here while StandardScaler is applied only
# after the transform; the conventional order is to standardise first, then fit PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[783]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [784]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# pca was fitted with every component retained, so keep only the first n_components
# score columns (equivalent to having fitted PCA(n_components=7) directly).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [785]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component SCORES that are merely labelled
# with the original variable names — the first component is not 'cardiovasc_death_rate'.
# Renaming them PC1..PC7 would prevent misreading the later feature-importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [786]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns are never used downstream (X is built from
# principal_df); the practical effect is only to remove 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [787]:
# Assemble the model matrix from the principal-component scores and take the target
# from the (unchanged) mortality column of the working frame.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [788]:
# Fit scaling on the training set
# (mean/std are learned from X_train only, so the test split cannot leak into the
# scaler; note the inputs at this point are already PCA scores, not raw features)
scaler = StandardScaler()
scaler.fit(X_train)
Out[788]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [789]:
# Apply scaling on the training set
# (uses the mean/std estimated from X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [790]:
# Apply scaling on the test set
# (reuses the training-set statistics — the scaler is never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [791]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — GridSearchCV below overrides every
# parameter listed in param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3 x 3 x 3 x 3 = 81 candidate combinations, searched exhaustively
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [792]:
# perform grid search and 10-fold cross-validation (k = 10)
# (GridSearchCV's default scoring for a regressor is R^2)
# NOTE(review): rows are shuffled daily time-series observations with largely constant
# per-country features, so this CV score is likely optimistic; a time-based split
# would be a stricter validation — confirm before reporting.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.992829851903131
In [793]:
# Refit a random forest with the best hyperparameter combination found above
# (the param_grid keys map one-to-one onto RandomForestRegressor constructor arguments).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Generate test-set predictions for evaluation in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [794]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) returns the Kullback-Leibler divergence of
# the two (normalised) sequences, not an entropy of the prediction errors — confirm
# this is the intended metric before reporting it as "Entropy".
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.08964626978739043
R2 Score: 0.9906665614728145
RMSE: 0.299410
Entropy Value: 0.005413259877489937
In [795]:
# Rank the model inputs by impurity-based feature importance.
# NOTE(review): the inputs are principal-component scores that were labelled with the
# raw variable names, so e.g. 'diabetes_prevalence' below denotes the second principal
# component, not the raw diabetes_prevalence column — interpret the ranking accordingly.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[795]:
feature importance
1 diabetes_prevalence 0.795771
0 cardiovasc_death_rate 0.160583
2 female_smokers 0.024566
5 aged_65_older 0.009192
6 median_age 0.007044
3 male_smokers 0.002026
4 life_expectancy 0.000817
In [796]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute local path — consider a configurable DATA_DIR /
# relative path so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[796]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [797]:
country1 = 'Estonia'
country2 = 'France'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() detaches the filtered subset from the parent frame so the lagged-column
# assignments in the next cells cannot raise SettingWithCopyWarning or silently
# write into a view of the original DataFrame.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [798]:
df_updated
Out[798]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.50 29481.252 31.033 1326064 0.000000
6250 Estonia 1/18/2020 4.69 0.50 29481.252 31.033 1326064 0.000000
6251 Estonia 2/5/2020 4.69 0.50 29481.252 31.033 1326064 0.000000
6252 Estonia 2/6/2020 4.69 0.50 29481.252 31.033 1326064 0.000000
6253 Estonia 2/7/2020 4.69 0.50 29481.252 31.033 1326064 0.000000
... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 5.98 0.02 38605.671 122.578 67813000 0.411710
9443 France 12/26/2022 5.98 0.02 38605.671 122.578 67813000 0.411282
9444 France 12/27/2022 5.98 0.02 38605.671 122.578 67813000 0.411730
9445 France 12/28/2022 5.98 0.02 38605.671 122.578 67813000 0.411813
9446 France 12/29/2022 5.98 0.02 38605.671 122.578 67813000 0.411892

2132 rows × 8 columns

In [799]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features: mortality 1 day, 7 days and 30 days back,
# computed per country so lagged values never cross a location boundary.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, periods in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(periods)
In [800]:
# The first 1 / 7 / 30 rows of each country's series have no lagged value;
# treat the pre-observation period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [801]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the regression target) and the
# three lagged-mortality columns, so the components are partly built from the target —
# confirm this leakage is intended before interpreting downstream scores.
# NOTE(review): PCA is fitted on unscaled data here while StandardScaler is applied only
# after the transform; the conventional order is to standardise first, then fit PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[801]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [802]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# pca was fitted with every component retained, so keep only the first n_components
# score columns (equivalent to having fitted PCA(n_components=5) directly).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [803]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component SCORES that are merely labelled
# with the original variable names — the first component is not 'hospital_beds_per_thousand'.
# Renaming them PC1..PC5 would prevent misreading the later feature-importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [804]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns are never used downstream (X is built from
# principal_df); the practical effect is only to remove 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [805]:
# Assemble the model matrix from the principal-component scores and take the target
# from the (unchanged) mortality column of the working frame.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [806]:
# Fit scaling on the training set
# (mean/std are learned from X_train only, so the test split cannot leak into the
# scaler; note the inputs at this point are already PCA scores, not raw features)
scaler = StandardScaler()
scaler.fit(X_train)
Out[806]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [807]:
# Apply scaling on the training set
# (uses the mean/std estimated from X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [808]:
# Apply scaling on the test set
# (reuses the training-set statistics — the scaler is never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [809]:
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — GridSearchCV below overrides every
# parameter listed in param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3 x 3 x 3 x 3 = 81 candidate combinations, searched exhaustively
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [810]:
# perform grid search and 10-fold cross-validation (k = 10)
# (GridSearchCV's default scoring for a regressor is R^2)
# NOTE(review): rows are shuffled daily time-series observations with largely constant
# per-country features, so this CV score is likely optimistic; a time-based split
# would be a stricter validation — confirm before reporting.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9922426580613892
In [811]:
# Refit a random forest with the best hyperparameter combination found above
# (the param_grid keys map one-to-one onto RandomForestRegressor constructor arguments).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Generate test-set predictions for evaluation in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [812]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) returns the Kullback-Leibler divergence of
# the two (normalised) sequences, not an entropy of the prediction errors — confirm
# this is the intended metric before reporting it as "Entropy".
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.10037550062654846
R2 Score: 0.9895494975200281
RMSE: 0.316821
Entropy Value: 0.005971530997882069
In [813]:
# Rank the model inputs by impurity-based feature importance.
# NOTE(review): the inputs are principal-component scores that were labelled with the
# raw variable names, so e.g. 'extreme_poverty' below denotes the second principal
# component, not the raw extreme_poverty column — interpret the ranking accordingly.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[813]:
feature importance
1 extreme_poverty 0.949550
2 gdp_per_capita 0.026829
0 hospital_beds_per_thousand 0.016964
3 population_density 0.005760
4 population 0.000897
In [814]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute local path — consider a configurable DATA_DIR /
# relative path so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[814]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [815]:
country1 = 'Italy'
country2 = 'Latvia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the filtered subset from the parent frame so the lagged-column
# assignments in the next cells cannot raise SettingWithCopyWarning or silently
# write into a view of the original DataFrame.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [816]:
df_updated
Out[816]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
19873 Latvia 1/6/2020 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.000000
19874 Latvia 1/18/2020 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.000000
19875 Latvia 2/12/2020 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.000000
19876 Latvia 2/29/2020 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.000000
19877 Latvia 3/1/2020 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2102 rows × 10 columns

In [817]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features: mortality 1 day, 7 days and 30 days back,
# computed per country so lagged values never cross a location boundary.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, periods in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(periods)
In [818]:
# The first 1 / 7 / 30 rows of each country's series have no lagged value;
# treat the pre-observation period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [819]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the regression target) and the
# three lagged-mortality columns, so the components are partly built from the target —
# confirm this leakage is intended before interpreting downstream scores.
# NOTE(review): PCA is fitted on unscaled data here while StandardScaler is applied only
# after the transform; the conventional order is to standardise first, then fit PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[819]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [820]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# pca was fitted with every component retained, so keep only the first n_components
# score columns (equivalent to having fitted PCA(n_components=7) directly).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [821]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component SCORES that are merely labelled
# with the original variable names — the first component is not 'cardiovasc_death_rate'.
# Renaming them PC1..PC7 would prevent misreading the later feature-importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [822]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns are never used downstream (X is built from
# principal_df); the practical effect is only to remove 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [823]:
# Assemble the model matrix from the principal-component scores and take the target
# from the (unchanged) mortality column of the working frame.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [824]:
# Fit scaling on the training set
# Fitted on training data only (the test set is transformed later with the
# same parameters, avoiding leakage); the fitted scaler is the displayed output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[824]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [825]:
# Apply scaling on the training set
# Transform only — the scaler was fitted on X_train in the cell above.
X_train_scaled = scaler.transform(X_train)
In [826]:
# Apply scaling on the test set
# Same fitted scaler as the training set — no refit on test data.
X_test_scaled = scaler.transform(X_test)
In [827]:
# Instantiate the RandomForestRegressor Model
# Base estimator for the grid search; n_estimators here is overridden by the
# values in param_grid below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 hyperparameter combinations for GridSearchCV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [828]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scoring for a regressor is R^2, and the CV folds are
# not time-ordered — adjacent days of the same country can land in both train
# and validation folds, which inflates scores for time-series data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9987367076025663
In [829]:
# fit random forest model with best hyperparameters from above
# best_params_ holds exactly the four tuned keys, so it can be splatted
# directly into the constructor.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [830]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its inputs as probability
# distributions (it normalizes them and computes KL divergence); applying it to
# raw regression targets/predictions is not a standard error metric, and zero
# entries in y_test can yield inf — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01325682921637842
R2 Score: 0.9988426183663411
RMSE: 0.115138
Entropy Value: 0.0005827591181866116
In [831]:
# Rank the model inputs by the forest's impurity-based feature importances.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[831]:
feature importance
1 diabetes_prevalence 0.468791
0 cardiovasc_death_rate 0.462287
5 aged_65_older 0.047960
2 female_smokers 0.017033
6 median_age 0.002431
3 male_smokers 0.001293
4 life_expectancy 0.000205
In [832]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not reproducible on other
# machines; prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[832]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [833]:
# Pair of countries compared in this analysis run.
country1 = 'Italy'
country2 = 'Latvia'

# Keep the country-health-index features plus the target, restricted to the
# two selected countries (single .loc does both selections at once).
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, keep_cols]
In [834]:
# Inspect the filtered dataframe (rich display as the cell's last expression).
df_updated
Out[834]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
19873 Latvia 1/6/2020 5.57 0.7 25063.846 31.212 1850654 0.000000
19874 Latvia 1/18/2020 5.57 0.7 25063.846 31.212 1850654 0.000000
19875 Latvia 2/12/2020 5.57 0.7 25063.846 31.212 1850654 0.000000
19876 Latvia 2/29/2020 5.57 0.7 25063.846 31.212 1850654 0.000000
19877 Latvia 3/1/2020 5.57 0.7 25063.846 31.212 1850654 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 2.0 35220.084 205.859 59037472 0.735109

2102 rows × 8 columns

In [835]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() leaves NaN in each country's earliest rows; zero-filled in the next cell.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag, col in [(1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')]:
    df_updated[col] = mortality_by_country.shift(lag)
In [836]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The earliest rows of each country's series have no prior observation to lag from.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [837]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lagged
# mortality columns, so the target leaks into the components; the features are
# unscaled, so large-magnitude columns (e.g. population) dominate — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[837]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [838]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first (highest-variance) 5 component scores per row.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [839]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores, not the original
# features; reusing the original feature names is misleading when the
# feature-importance table is read later — consider 'PC1'..'PC5' labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the component scores.
principal_df['location'] = df_updated['location'].values
In [840]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not referenced afterwards (X is built from
# principal_df and y from 'Mortality Rate') — confirm the encoding is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [841]:
# Model inputs are the PCA component scores; target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
In [842]:
# Fit scaling on the training set
# Fitted on training data only (the test set is transformed later with the
# same parameters, avoiding leakage); the fitted scaler is the displayed output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[842]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [843]:
# Apply scaling on the training set
# Transform only — the scaler was fitted on X_train in the cell above.
X_train_scaled = scaler.transform(X_train)
In [844]:
# Apply scaling on the test set
# Same fitted scaler as the training set — no refit on test data.
X_test_scaled = scaler.transform(X_test)
In [845]:
# Instantiate the RandomForestRegressor Model
# Base estimator for the grid search; n_estimators here is overridden by the
# values in param_grid below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 hyperparameter combinations for GridSearchCV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [846]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scoring for a regressor is R^2, and the CV folds are
# not time-ordered — adjacent days of the same country can land in both train
# and validation folds, which inflates scores for time-series data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9987496708836977
In [847]:
# fit random forest model with best hyperparameters from above
# best_params_ holds exactly the four tuned keys, so it can be splatted
# directly into the constructor.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [848]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its inputs as probability
# distributions (normalizes them, computes KL divergence); it is not a standard
# regression metric and zeros in y_test can yield inf — confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.012769557121584701
R2 Score: 0.9988851594418807
RMSE: 0.113002
Entropy Value: 0.000472757332466212
In [849]:
# Rank the model inputs by the forest's impurity-based feature importances.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[849]:
feature importance
1 extreme_poverty 0.947730
2 gdp_per_capita 0.025692
0 hospital_beds_per_thousand 0.023602
3 population_density 0.002542
4 population 0.000433
In [850]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not reproducible on other
# machines; prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[850]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [851]:
# Pair of countries compared in this analysis run.
country1 = 'Portugal'
country2 = 'Slovakia'

# Keep the population-health-index features plus the target, restricted to the
# two selected countries (single .loc does both selections at once).
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, keep_cols]
In [852]:
# Inspect the filtered dataframe (rich display as the cell's last expression).
df_updated
Out[852]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
12542 Slovakia 12/25/2022 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.783216
12543 Slovakia 12/26/2022 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.783313
12544 Slovakia 12/27/2022 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.783363
12545 Slovakia 12/28/2022 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.783459
12546 Slovakia 12/29/2022 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.783522

2063 rows × 10 columns

In [853]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() leaves NaN in each country's earliest rows; zero-filled in the next cell.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag, col in [(1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')]:
    df_updated[col] = mortality_by_country.shift(lag)
In [854]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The earliest rows of each country's series have no prior observation to lag from.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [855]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lagged
# mortality columns, so the target leaks into the components; the features are
# unscaled, so large-magnitude columns dominate the variance — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[855]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [856]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first (highest-variance) 7 component scores per row.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [857]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores, not the original
# features; reusing the original feature names is misleading when the
# feature-importance table is read later — consider 'PC1'..'PC7' labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Carry the country label alongside the component scores.
principal_df['location'] = df_updated['location'].values
In [858]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not referenced afterwards (X is built from
# principal_df and y from 'Mortality Rate') — confirm the encoding is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [859]:
# Model inputs are the PCA component scores; target is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
In [860]:
# Fit scaling on the training set
# Fitted on training data only (the test set is transformed later with the
# same parameters, avoiding leakage); the fitted scaler is the displayed output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[860]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [861]:
# Apply scaling on the training set
# Transform only — the scaler was fitted on X_train in the cell above.
X_train_scaled = scaler.transform(X_train)
In [862]:
# Apply scaling on the test set
# Same fitted scaler as the training set — no refit on test data.
X_test_scaled = scaler.transform(X_test)
In [863]:
# Instantiate the RandomForestRegressor Model
# Base estimator for the grid search; n_estimators here is overridden by the
# values in param_grid below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 hyperparameter combinations for GridSearchCV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [864]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scoring for a regressor is R^2, and the CV folds are
# not time-ordered — adjacent days of the same country can land in both train
# and validation folds, which inflates scores for time-series data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9972097011690918
In [865]:
# fit random forest model with best hyperparameters from above
# best_params_ holds exactly the four tuned keys, so it can be splatted
# directly into the constructor.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [866]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its inputs as probability
# distributions (normalizes them, computes KL divergence); it is not a standard
# regression metric and zeros in y_test can yield inf — confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001539940090586167
R2 Score: 0.9978668126685196
RMSE: 0.039242
Entropy Value: 0.0003056208558480997
In [867]:
# Rank the model inputs by the forest's impurity-based feature importances.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[867]:
feature importance
1 diabetes_prevalence 0.573738
0 cardiovasc_death_rate 0.369553
6 median_age 0.030975
2 female_smokers 0.018394
5 aged_65_older 0.003914
3 male_smokers 0.002962
4 life_expectancy 0.000464
In [868]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not reproducible on other
# machines; prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[868]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [869]:
# Pair of countries compared in this analysis run.
country1 = 'Portugal'
country2 = 'Slovakia'

# Keep the country-health-index features plus the target, restricted to the
# two selected countries (single .loc does both selections at once).
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, keep_cols]
In [870]:
# Inspect the filtered dataframe (rich display as the cell's last expression).
df_updated
Out[870]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ...
12542 Slovakia 12/25/2022 5.82 0.7 30155.152 113.128 5643455 0.783216
12543 Slovakia 12/26/2022 5.82 0.7 30155.152 113.128 5643455 0.783313
12544 Slovakia 12/27/2022 5.82 0.7 30155.152 113.128 5643455 0.783363
12545 Slovakia 12/28/2022 5.82 0.7 30155.152 113.128 5643455 0.783459
12546 Slovakia 12/29/2022 5.82 0.7 30155.152 113.128 5643455 0.783522

2063 rows × 8 columns

In [871]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() leaves NaN in each country's earliest rows; zero-filled in the next cell.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag, col in [(1, 'prev_day_mortality'),
                 (7, 'prev_week_mortality'),
                 (30, 'prev_month_mortality')]:
    df_updated[col] = mortality_by_country.shift(lag)
In [872]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The earliest rows of each country's series have no prior observation to lag from.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [873]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lagged
# mortality columns, so the target leaks into the components; the features are
# unscaled, so large-magnitude columns (e.g. population) dominate — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[873]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [874]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first (highest-variance) 5 component scores per row.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [875]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores, not the original
# features; reusing the original feature names is misleading when the
# feature-importance table is read later — consider 'PC1'..'PC5' labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Carry the country label alongside the component scores.
principal_df['location'] = df_updated['location'].values
In [876]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not referenced afterwards (X is built from
# principal_df and y from 'Mortality Rate') — confirm the encoding is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [877]:
# Model inputs are the PCA component scores; target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
In [878]:
# Fit scaling on the training set
# Fitted on training data only (the test set is transformed later with the
# same parameters, avoiding leakage); the fitted scaler is the displayed output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[878]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [879]:
# Apply scaling on the training set
# Transform only — the scaler was fitted on X_train in the cell above.
X_train_scaled = scaler.transform(X_train)
In [880]:
# Apply scaling on the test set
# Same fitted scaler as the training set — no refit on test data.
X_test_scaled = scaler.transform(X_test)
In [881]:
# Instantiate the RandomForestRegressor Model
# Base estimator for the grid search; n_estimators here is overridden by the
# values in param_grid below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# 3 * 3 * 3 * 3 = 81 hyperparameter combinations for GridSearchCV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [882]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scoring for a regressor is R^2, and the CV folds are
# not time-ordered — adjacent days of the same country can land in both train
# and validation folds, which inflates scores for time-series data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9978257589564323
In [883]:
# fit random forest model with best hyperparameters from above
# best_params_ holds exactly the four tuned keys, so it can be splatted
# directly into the constructor.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [884]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its inputs as probability
# distributions (normalizes them, computes KL divergence); it is not a standard
# regression metric and zeros in y_test can yield inf — confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0017579514577669621
R2 Score: 0.9975648145002586
RMSE: 0.041928
Entropy Value: 0.00038885161098168246
In [885]:
# Rank the model inputs by the forest's impurity-based feature importances.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[885]:
feature importance
1 extreme_poverty 0.940339
0 hospital_beds_per_thousand 0.028359
2 gdp_per_capita 0.025732
3 population_density 0.005010
4 population 0.000560
In [886]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — not reproducible on other
# machines; prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[886]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [887]:
country1 = 'Spain'
country2 = 'Bulgaria'

# Restrict the frame to the population-health predictors (plus identifiers and
# the target) for the selected pair of countries.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
     'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
     'median_age', 'Mortality Rate'],
]
In [888]:
df_updated
Out[888]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148

2090 rows × 10 columns

In [889]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so one country's history never leaks into another's)
# NOTE(review): these lag columns later end up in the PCA input via iloc[:, 2:] — see BUG note on the PCA cell.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [890]:
# The earliest rows of each country have no history, so shifting produced NaNs;
# treat the missing lagged mortality as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [891]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# BUG(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies (columns 9-12 after the selection above), so the TARGET leaks into the fitted
# components and inflates every downstream score. Fit on the predictor columns only.
# Also note PCA is scale-sensitive; inputs should be standardized before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[891]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [892]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): same leakage as in the fit cell — iloc[:, 2:] carries 'Mortality Rate'
# and the lag columns into the projection; truncating to the first 7 of the 11
# components does not remove that target information.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [893]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL inputs),
# not the original variables — labeling them with raw feature names makes the later
# feature-importance table read as if it ranked raw features. Prefer PC1..PC7 labels
# and inspect pca.components_ to map PCs back to variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [894]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used downstream
# (X is built from principal_df and y from 'Mortality Rate'); this cell is dead code
# and can be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [895]:
# Design matrix from the principal components; target from the original frame.
# Rows align because principal_df was built from df_updated in the same row order.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [896]:
# Fit the standardizer on the training split only, so test-set statistics
# cannot influence the scaling.
scaler = StandardScaler().fit(X_train)
Out[896]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [897]:
# Apply scaling on the training set
# (uses the mean/std statistics fitted on X_train above)
X_train_scaled = scaler.transform(X_train)
In [898]:
# Apply scaling on the test set
# (same training-set statistics — the test split must not refit the scaler)
X_test_scaled = scaler.transform(X_test)
In [899]:
# Instantiate the RandomForestRegressor Model
# (base estimator for the grid search; fixed seed keeps runs reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# (3 * 3 * 3 * 3 = 81 candidate configurations explored by GridSearchCV below)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [900]:
# Perform grid search with 10-fold cross-validation (k = 10).
# Improvement(review): n_jobs=-1 parallelizes the 81-candidate x 10-fold search
# across all CPU cores; the selected model and scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (for a regressor the default scoring is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9819133940673177
In [901]:
# fit random forest model with best hyperparameters from above
# Improvement(review): GridSearchCV with refit=True (the default) has already
# refit the best configuration on the full training set, so reuse that estimator
# instead of rebuilding it parameter-by-parameter (same params, same random_state=42).
best_rf_model = grid_search.best_estimator_

# Predict on the held-out, scaled test split
y_pred = best_rf_model.predict(X_test_scaled)
In [902]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# probability distributions (it normalizes each input to sum to 1). y_test/y_pred
# are mortality rates, not distributions, so this value is not a meaningful
# regression metric — consider mean_absolute_error instead. TODO confirm and replace.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005371342469960679
R2 Score: 0.9989002795207302
RMSE: 0.073289
Entropy Value: 0.0005215588278580451
In [903]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the inputs are principal components that were labeled with raw
# feature names (see the PCA cells), so this table ranks PCs, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[903]:
feature importance
5 aged_65_older 0.474280
1 diabetes_prevalence 0.397780
0 cardiovasc_death_rate 0.091164
2 female_smokers 0.019135
6 median_age 0.010034
3 male_smokers 0.005037
4 life_expectancy 0.002570
In [904]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): machine-specific absolute path — breaks on any other machine.
# Prefer a repo-relative DATA_DIR constant defined in a config cell.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[904]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [905]:
country1 = 'Spain'
country2 = 'Bulgaria'

# Restrict the frame to the country-health-index predictors (plus identifiers
# and the target) for the selected pair of countries.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
     'gdp_per_capita', 'population_density', 'population', 'Mortality Rate'],
]
In [906]:
df_updated
Out[906]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.970 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.970 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.970 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.970 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.970 1.0 34272.360 93.105 47558632 0.855148

2090 rows × 8 columns

In [907]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so one country's history never leaks into another's)
# NOTE(review): these lag columns later end up in the PCA input via iloc[:, 2:] — see BUG note on the PCA cell.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [908]:
# The earliest rows of each country have no history, so shifting produced NaNs;
# treat the missing lagged mortality as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [909]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# BUG(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies (columns 7-10 after the selection above), so the TARGET leaks into the fitted
# components and inflates every downstream score. Fit on the predictor columns only.
# Also note PCA is scale-sensitive; inputs should be standardized before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[909]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [910]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): same leakage as in the fit cell — iloc[:, 2:] carries 'Mortality Rate'
# and the lag columns into the projection; truncating to the first 5 of the 9
# components does not remove that target information.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [911]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL inputs),
# not the original variables — labeling them with raw feature names makes the later
# feature-importance table read as if it ranked raw features. Prefer PC1..PC5 labels
# and inspect pca.components_ to map PCs back to variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [912]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used downstream
# (X is built from principal_df and y from 'Mortality Rate'); this cell is dead code
# and can be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [913]:
# Design matrix from the principal components; target from the original frame.
# Rows align because principal_df was built from df_updated in the same row order.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [914]:
# Fit the standardizer on the training split only, so test-set statistics
# cannot influence the scaling.
scaler = StandardScaler().fit(X_train)
Out[914]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [915]:
# Apply scaling on the training set
# (uses the mean/std statistics fitted on X_train above)
X_train_scaled = scaler.transform(X_train)
In [916]:
# Apply scaling on the test set
# (same training-set statistics — the test split must not refit the scaler)
X_test_scaled = scaler.transform(X_test)
In [917]:
# Instantiate the RandomForestRegressor Model
# (base estimator for the grid search; fixed seed keeps runs reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# (3 * 3 * 3 * 3 = 81 candidate configurations explored by GridSearchCV below)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [918]:
# Perform grid search with 10-fold cross-validation (k = 10).
# Improvement(review): n_jobs=-1 parallelizes the 81-candidate x 10-fold search
# across all CPU cores; the selected model and scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (for a regressor the default scoring is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9840422767394642
In [919]:
# fit random forest model with best hyperparameters from above
# Improvement(review): GridSearchCV with refit=True (the default) has already
# refit the best configuration on the full training set, so reuse that estimator
# instead of rebuilding it parameter-by-parameter (same params, same random_state=42).
best_rf_model = grid_search.best_estimator_

# Predict on the held-out, scaled test split
y_pred = best_rf_model.predict(X_test_scaled)
In [920]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# probability distributions (it normalizes each input to sum to 1). y_test/y_pred
# are mortality rates, not distributions, so this value is not a meaningful
# regression metric — consider mean_absolute_error instead. TODO confirm and replace.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0071890946885995385
R2 Score: 0.9985281156990684
RMSE: 0.084789
Entropy Value: 0.0006206494126241532
In [921]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the inputs are principal components that were labeled with raw
# feature names (see the PCA cells), so this table ranks PCs, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[921]:
feature importance
1 extreme_poverty 0.934779
2 gdp_per_capita 0.051476
4 population 0.007491
3 population_density 0.004398
0 hospital_beds_per_thousand 0.001856
In [922]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): machine-specific absolute path — breaks on any other machine.
# Prefer a repo-relative DATA_DIR constant defined in a config cell.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[922]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [98]:
country1 = 'Romania'
country2 = 'Serbia'

# Restrict the frame to the population-health predictors (plus identifiers and
# the target) for the selected pair of countries.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
     'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
     'median_age', 'Mortality Rate'],
]
In [99]:
df_updated
Out[99]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403

2076 rows × 10 columns

In [100]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so one country's history never leaks into another's)
# NOTE(review): these lag columns later end up in the PCA input via iloc[:, 2:] — see BUG note on the PCA cell.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [101]:
# The earliest rows of each country have no history, so shifting produced NaNs;
# treat the missing lagged mortality as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [102]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# BUG(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies (columns 9-12 after the selection above), so the TARGET leaks into the fitted
# components and inflates every downstream score. Fit on the predictor columns only.
# Also note PCA is scale-sensitive; inputs should be standardized before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[102]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [103]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): same leakage as in the fit cell — iloc[:, 2:] carries 'Mortality Rate'
# and the lag columns into the projection; truncating to the first 7 of the 11
# components does not remove that target information.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [104]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL inputs),
# not the original variables — labeling them with raw feature names makes the later
# feature-importance table read as if it ranked raw features. Prefer PC1..PC7 labels
# and inspect pca.components_ to map PCs back to variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [105]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used downstream
# (X is built from principal_df and y from 'Mortality Rate'); this cell is dead code
# and can be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [106]:
# Design matrix from the principal components; target from the original frame.
# Rows align because principal_df was built from df_updated in the same row order.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [107]:
# Fit the standardizer on the training split only, so test-set statistics
# cannot influence the scaling.
scaler = StandardScaler().fit(X_train)
Out[107]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [108]:
# Apply scaling on the training set
# (uses the mean/std statistics fitted on X_train above)
X_train_scaled = scaler.transform(X_train)
In [109]:
# Apply scaling on the test set
# (same training-set statistics — the test split must not refit the scaler)
X_test_scaled = scaler.transform(X_test)
In [110]:
# Instantiate the RandomForestRegressor Model
# (base estimator for the grid search; fixed seed keeps runs reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
# (3 * 3 * 3 * 3 = 81 candidate configurations explored by GridSearchCV below)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [111]:
# Perform grid search with 10-fold cross-validation (k = 10).
# Improvement(review): n_jobs=-1 parallelizes the 81-candidate x 10-fold search
# across all CPU cores; the selected model and scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
# (for a regressor the default scoring is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9971039894626419
In [112]:
# fit random forest model with best hyperparameters from above
# Improvement(review): GridSearchCV with refit=True (the default) has already
# refit the best configuration on the full training set, so reuse that estimator
# instead of rebuilding it parameter-by-parameter (same params, same random_state=42).
best_rf_model = grid_search.best_estimator_

# Predict on the held-out, scaled test split
y_pred = best_rf_model.predict(X_test_scaled)
In [113]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# probability distributions (it normalizes each input to sum to 1). y_test/y_pred
# are mortality rates, not distributions, so this value is not a meaningful
# regression metric — consider mean_absolute_error instead. TODO confirm and replace.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002022927413440657
R2 Score: 0.9988222346389628
RMSE: 0.044977
Entropy Value: 0.00025466415334854067
In [114]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the inputs are principal components that were labeled with raw
# feature names (see the PCA cells), so this table ranks PCs, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[114]:
feature importance
0 cardiovasc_death_rate 0.701857
5 aged_65_older 0.104510
1 diabetes_prevalence 0.099736
6 median_age 0.082303
2 female_smokers 0.009089
3 male_smokers 0.002010
4 life_expectancy 0.000495
In [115]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): machine-specific absolute path — breaks on any other machine.
# Prefer a repo-relative DATA_DIR constant defined in a config cell.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[115]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [116]:
country1 = 'Romania'
country2 = 'Serbia'

# Restrict the frame to the country-health-index predictors (plus identifiers
# and the target) for the selected pair of countries.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
     'gdp_per_capita', 'population_density', 'population', 'Mortality Rate'],
]
In [117]:
df_updated
Out[117]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 5.70 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 5.70 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 5.70 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 5.70 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 5.70 23313.199 85.129 19659270 2.036403

2076 rows × 8 columns

In [118]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so one country's history never leaks into another's)
# NOTE(review): these lag columns later end up in the PCA input via iloc[:, 2:] — see BUG note on the PCA cell.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [119]:
# The earliest rows of each country have no history, so shifting produced NaNs;
# treat the missing lagged mortality as 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [120]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# BUG(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies (columns 7-10 after the selection above), so the TARGET leaks into the fitted
# components and inflates every downstream score. Fit on the predictor columns only.
# Also note PCA is scale-sensitive; inputs should be standardized before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[120]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [121]:
# Setting the number of principal components to 5 as this equals the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# NOTE(review): this keeps the first 5 principal components ranked by explained
# variance — each is a linear mixture of ALL columns passed to PCA, not a
# selection of 5 original variables.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [122]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names relabel principal components with original
# feature names. PC1 is NOT 'hospital_beds_per_thousand' — it is a weighted
# combination of every PCA input column. The downstream "feature importances"
# therefore describe components, not the named variables; rename to PC1..PC5
# (or skip PCA) to avoid misreading the results.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [123]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns appear unused below —
# the model's X is built from principal_df — so this step may be removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [124]:
# Model inputs are the retained principal components (labelled with the country
# health-index feature names); the target is the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [125]:
# Fit scaling on the training set
# (fit on training data only so test-set statistics never influence the scaler)
scaler = StandardScaler()
scaler.fit(X_train)  # last expression: notebook displays the fitted scaler
Out[125]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [126]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses train-set mean/std
In [127]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # same train-set mean/std — no refit
In [128]:
# Base Random Forest regressor; random_state pins the ensemble across re-runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by the grid search below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [129]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3 = 81 parameter combinations x 10 folds = 810 model
# fits — expensive; consider RandomizedSearchCV or fewer folds for iteration.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.996697918034125
In [130]:
# Refit a fresh forest on the full training set using the winning grid-search
# hyperparameters (unpacking best_params_ passes exactly the four tuned keys).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [131]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# probability distributions (it normalizes its inputs to sum to 1). Applying it
# to raw regression targets/predictions is not a standard error metric, and a
# zero anywhere in y_pred yields inf. Interpret with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001983248045491881
R2 Score: 0.9988453363008452
RMSE: 0.044534
Entropy Value: 0.00043708292617609305
In [132]:
# Rank model inputs by impurity-based Random Forest importance.
# NOTE(review): X was built from principal components relabelled with original
# feature names, so these "importances" belong to PCs, not to the variables
# named in the table — the headline conclusion should not cite them as such.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[132]:
feature importance
1 extreme_poverty 0.659949
0 hospital_beds_per_thousand 0.309256
2 gdp_per_capita 0.022356
3 population_density 0.007392
4 population 0.001047
In [2]:
# Country Pair by Pair Analysis relative to extreme poverty
In [3]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[3]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [133]:
# Per-country frames for the 13 extreme-poverty country pairings; each pair
# occupies one two-line group below.
df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Czechia = df.loc[df["location"] == "Czechia"]

df_Finland = df.loc[df["location"] == "Finland"]
df_France = df.loc[df["location"] == "France"]

df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Serbia = df.loc[df["location"] == "Serbia"]

df_Slovenia = df.loc[df["location"] == "Slovenia"]
df_Switzerland = df.loc[df["location"] == "Switzerland"]

df_Austria = df.loc[df["location"] == "Austria"]
df_Belgium = df.loc[df["location"] == "Belgium"]

df_Canada = df.loc[df["location"] == "Canada"]
df_Denmark = df.loc[df["location"] == "Denmark"]

df_Estonia = df.loc[df["location"] == "Estonia"]
df_Iceland = df.loc[df["location"] == "Iceland"]

df_Ireland = df.loc[df["location"] == "Ireland"]
df_Latvia = df.loc[df["location"] == "Latvia"]

df_Luxembourg = df.loc[df["location"] == "Luxembourg"]
df_Portugal = df.loc[df["location"] == "Portugal"]

df_Slovakia = df.loc[df["location"] == "Slovakia"]
df_Sweden = df.loc[df["location"] == "Sweden"]

df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]

df_Italy = df.loc[df["location"] == "Italy"]
df_Romania = df.loc[df["location"] == "Romania"]

df_Spain = df.loc[df["location"] == "Spain"]
df_UnitedStates = df.loc[df["location"] == "United States"]
In [134]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [135]:
# Stack the selected per-country frames (one country from each pairing, with
# the trimmed UK frame) into a single dataframe, preserving this exact order.
dataframes = [
    df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark,
    df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal,
    df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia,
    df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy,
    df_Sweden, df_Spain, df_Slovenia, df_UnitedStates,
]
dataframe_one = pd.concat(dataframes)

# Persist the combined frame for the following analysis steps.
dataframe_one.to_csv("dataframe-one.csv")
In [136]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): the previous cell writes "dataframe-one.csv" to the working
# directory, but this reads from an absolute Downloads path — if they differ,
# a stale file is silently loaded. Prefer one configurable path for both.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[136]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [137]:
country1 = 'Cyprus'
country2 = 'Czechia'

# Keep only the population-health features (plus keys and target), then
# restrict the frame to the current country pair.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy',
               'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[:, health_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [138]:
df_updated
Out[138]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919575

2061 rows × 10 columns

In [139]:
# Convert the per-country mortality time series into a supervised-learning
# table. Random Forests treat rows as independent observations, so the past
# must be encoded explicitly as lagged feature columns rather than implied by
# row order. Lags: previous day (1), previous week (7), previous month (30),
# computed within each country so series never bleed into each other.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [140]:
# The earliest day/week/month of each country's series has no history to lag
# from; treat those missing lagged mortality values as 0 instead of dropping rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [141]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged copies
# (per the column layout selected above) — the prediction target leaks into
# the PCA components used as model inputs, likely inflating the near-1.0 R^2
# below. Exclude the target columns before fitting.
# NOTE(review): PCA on unscaled data lets high-variance columns dominate the
# components — consider standardizing first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[141]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [142]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# NOTE(review): this keeps the first 7 principal components ranked by explained
# variance — each is a linear mixture of ALL columns passed to PCA, not a
# selection of 7 original variables.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [143]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names relabel principal components with original
# feature names; PC1 is not 'cardiovasc_death_rate' but a weighted mix of all
# PCA inputs. Downstream "feature importances" describe components, not the
# named variables — rename to PC1..PC7 to avoid misreading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [144]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns appear unused below —
# the model's X is built from principal_df — so this step may be removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [145]:
# Model inputs are the retained principal components (labelled with the
# population health-index feature names); the target is the mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [146]:
# Fit scaling on the training set
# (fit on training data only so test-set statistics never influence the scaler)
scaler = StandardScaler()
scaler.fit(X_train)  # last expression: notebook displays the fitted scaler
Out[146]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [147]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses train-set mean/std
In [148]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # same train-set mean/std — no refit
In [149]:
# Base Random Forest regressor; random_state pins the ensemble across re-runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by the grid search below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [150]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3 = 81 parameter combinations x 10 folds = 810 model
# fits — expensive; consider RandomizedSearchCV or fewer folds for iteration.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9931354068766577
In [151]:
# Refit a fresh forest on the full training set using the winning grid-search
# hyperparameters (unpacking best_params_ passes exactly the four tuned keys).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [152]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# probability distributions (it normalizes its inputs to sum to 1). Applying it
# to raw regression targets/predictions is not a standard error metric, and a
# zero anywhere in y_pred yields inf. Interpret with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00047399699097167984
R2 Score: 0.9991753265149302
RMSE: 0.021771
Entropy Value: 0.00015620474538757492
In [153]:
# Rank model inputs by impurity-based Random Forest importance.
# NOTE(review): X was built from principal components relabelled with original
# feature names, so these "importances" belong to PCs, not to the variables
# named in the table — the headline conclusion should not cite them as such.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[153]:
feature importance
1 diabetes_prevalence 0.658891
0 cardiovasc_death_rate 0.219210
5 aged_65_older 0.052242
6 median_age 0.044419
2 female_smokers 0.022219
3 male_smokers 0.001522
4 life_expectancy 0.001498
In [154]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path; also verify it points at the
# file exported earlier (which was written to the working directory).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[154]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [155]:
country1 = 'Cyprus'
country2 = 'Czechia'

# Keep only the country health-index features (plus keys and target), then
# restrict the frame to the current country pair.
index_cols = ['location', 'date', 'hospital_beds_per_thousand',
              'human_development_index', 'gdp_per_capita',
              'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[:, index_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [156]:
df_updated
Out[156]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.40 0.887 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.40 0.887 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.40 0.887 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.40 0.887 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.63 0.900 32605.906 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.63 0.900 32605.906 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.63 0.900 32605.906 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.63 0.900 32605.906 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.63 0.900 32605.906 137.176 10493990 0.919575

2061 rows × 8 columns

In [157]:
# Convert the per-country mortality time series into a supervised-learning
# table. Random Forests treat rows as independent observations, so the past
# must be encoded explicitly as lagged feature columns rather than implied by
# row order. Lags: previous day (1), previous week (7), previous month (30),
# computed within each country so series never bleed into each other.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [158]:
# The earliest day/week/month of each country's series has no history to lag
# from; treat those missing lagged mortality values as 0 instead of dropping rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [159]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged copies
# (per the column layout selected above) — the prediction target leaks into
# the PCA components used as model inputs, likely inflating the R^2 below.
# Exclude the target columns before fitting.
# NOTE(review): PCA on unscaled data lets high-variance columns such as
# 'population' dominate the components — consider standardizing first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[159]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [160]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# NOTE(review): this keeps the first 5 principal components ranked by explained
# variance — each is a linear mixture of ALL columns passed to PCA, not a
# selection of 5 original variables.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [161]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names relabel principal components with original
# feature names; PC1 is not 'hospital_beds_per_thousand' but a weighted mix of
# all PCA inputs. Downstream "feature importances" describe components, not the
# named variables — rename to PC1..PC5 to avoid misreading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [162]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns appear unused below —
# the model's X is built from principal_df — so this step may be removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [163]:
# Model inputs are the retained principal components (labelled with the country
# health-index feature names); the target is the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [164]:
# Fit scaling on the training set
# (fit on training data only so test-set statistics never influence the scaler)
scaler = StandardScaler()
scaler.fit(X_train)  # last expression: notebook displays the fitted scaler
Out[164]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [165]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses train-set mean/std
In [166]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # same train-set mean/std — no refit
In [167]:
# Base Random Forest regressor; random_state pins the ensemble across re-runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by the grid search below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [168]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3 = 81 parameter combinations x 10 folds = 810 model
# fits — expensive; consider RandomizedSearchCV or fewer folds for iteration.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9866151912407417
In [169]:
# Refit a fresh forest on the full training set using the winning grid-search
# hyperparameters (unpacking best_params_ passes exactly the four tuned keys).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [170]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# probability distributions (it normalizes its inputs to sum to 1). Applying it
# to raw regression targets/predictions is not a standard error metric, and a
# zero anywhere in y_pred yields inf. Interpret with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008157893614715191
R2 Score: 0.9858066639953047
RMSE: 0.090321
Entropy Value: 0.002246714925831334
In [171]:
# Rank model inputs by impurity-based Random Forest importance.
# NOTE(review): X was built from principal components relabelled with original
# feature names, so these "importances" belong to PCs, not to the variables
# named in the table — the headline conclusion should not cite them as such.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[171]:
feature importance
1 human_development_index 0.818042
0 hospital_beds_per_thousand 0.113415
2 gdp_per_capita 0.051214
3 population_density 0.013891
4 population 0.003438
In [172]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path; also verify it points at the
# file exported earlier (which was written to the working directory).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[172]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [173]:
country1 = 'Finland'
country2 = 'France'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [174]:
df_updated
Out[174]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7311 Finland 1/30/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7312 Finland 1/31/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7313 Finland 2/1/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7314 Finland 2/2/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
... ... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411892

2137 rows × 10 columns

In [175]:
# Convert the per-country mortality time series into a supervised-learning
# table. Random Forests treat rows as independent observations, so the past
# must be encoded explicitly as lagged feature columns rather than implied by
# row order. Lags: previous day (1), previous week (7), previous month (30),
# computed within each country so series never bleed into each other.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [176]:
# The earliest day/week/month of each country's series has no history to lag
# from; treat those missing lagged mortality values as 0 instead of dropping rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [177]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged copies
# (per the column layout selected above) — the prediction target leaks into
# the PCA components used as model inputs, likely inflating the R^2 below.
# Exclude the target columns before fitting.
# NOTE(review): PCA on unscaled data lets high-variance columns dominate the
# components — consider standardizing first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[177]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [178]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# keep only the first n_components component scores for every row
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [179]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear mixture of
# ALL PCA inputs), not the named original variable; reusing the raw feature
# names makes the later feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# carry the country label alongside the component scores (row order is preserved)
principal_df['location'] = df_updated['location'].values
In [180]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by the
# model cells below (X is built from principal_df) — possibly dead code; confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [181]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the first 7 principal-component scores (labeled with the original column
# names upstream); y: the same-day mortality rate for the matching rows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): rows are daily time-series observations; a random split puts
# strongly autocorrelated neighboring days in both train and test, so the test
# scores are likely optimistic. Consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [182]:
# Fit scaling on the training set
# (statistics are learned from the training split only, so the test set does
# not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[182]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [183]:
# Apply scaling on the training set
# uses the means/standard deviations learned from X_train above
X_train_scaled = scaler.transform(X_train)
In [184]:
# Apply scaling on the test set
# reuses the training-set statistics (the scaler is never refit on the test split)
X_test_scaled = scaler.transform(X_test)
In [185]:
# Instantiate the RandomForestRegressor Model
# (these constructor values are placeholders — GridSearchCV below clones the
# estimator and tries every combination from param_grid instead)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate hyperparameter combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [186]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81-candidate x 10-fold search across all cores;
# results are unchanged because each fit still uses random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9901279701885508
In [187]:
# fit random forest model with best hyperparameters from above.
# param_grid keys match RandomForestRegressor's argument names exactly, so the
# tuned values can be unpacked directly instead of copied key by key.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [188]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two vectors renormalized as probability distributions — not a
# standard regression metric, and undefined for zero/negative entries; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.03485209804982485
R2 Score: 0.9965523687852408
RMSE: 0.186687
Entropy Value: 0.0021807550391466923
In [189]:
# Rank the tuned model's inputs by impurity-based importance, highest first.
# NOTE: these "features" are the principal-component scores that were labeled
# with the original column names when principal_df was built.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[189]:
feature importance
1 diabetes_prevalence 0.519750
0 cardiovasc_death_rate 0.429350
2 female_smokers 0.034202
5 aged_65_older 0.006474
3 male_smokers 0.004594
6 median_age 0.004512
4 life_expectancy 0.001119
In [190]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[190]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [191]:
country1 = 'Finland'
country2 = 'France'

# Extracting important features for the Random Forest Model Analysis for the country health index.
# Row filter and column subset are applied in a single .loc call.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [192]:
# preview the filtered two-country frame
df_updated
Out[192]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
7310 Finland 1/29/2020 3.28 0.938 40585.721 18.136 5540745 0.000000
7311 Finland 1/30/2020 3.28 0.938 40585.721 18.136 5540745 0.000000
7312 Finland 1/31/2020 3.28 0.938 40585.721 18.136 5540745 0.000000
7313 Finland 2/1/2020 3.28 0.938 40585.721 18.136 5540745 0.000000
7314 Finland 2/2/2020 3.28 0.938 40585.721 18.136 5540745 0.000000
... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 5.98 0.901 38605.671 122.578 67813000 0.411710
9443 France 12/26/2022 5.98 0.901 38605.671 122.578 67813000 0.411282
9444 France 12/27/2022 5.98 0.901 38605.671 122.578 67813000 0.411730
9445 France 12/28/2022 5.98 0.901 38605.671 122.578 67813000 0.411813
9446 France 12/29/2022 5.98 0.901 38605.671 122.578 67813000 0.411892

2137 rows × 8 columns

In [193]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous month
# mortality rates. shift() runs within each country group, so lags never bleed
# across a country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [194]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# NaNs are the leading rows of each country that have no lag available;
# one vectorized fillna over all three columns replaces the three per-column calls.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [195]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the input df_updated.iloc[:, 2:] is unscaled, so components are
# dominated by the largest-variance columns — consider standardizing before PCA.
# NOTE(review): iloc[:, 2:] appears to include 'Mortality Rate' and its lagged
# copies, i.e. the target leaks into the components — confirm the column order.
# NOTE(review): PCA is fit on all rows, before the train/test split below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[195]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [196]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# keep only the first n_components component scores for every row
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [197]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear mixture of
# ALL PCA inputs), not the named original variable; reusing the raw feature
# names makes the later feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
# carry the country label alongside the component scores (row order is preserved)
principal_df['location'] = df_updated['location'].values
In [198]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by the
# model cells below (X is built from principal_df) — possibly dead code; confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [199]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
# X: the first 5 principal-component scores (labeled with the original column
# names upstream); y: the same-day mortality rate for the matching rows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): rows are daily time-series observations; a random split puts
# strongly autocorrelated neighboring days in both train and test, so the test
# scores are likely optimistic. Consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [200]:
# Fit scaling on the training set
# (statistics are learned from the training split only, so the test set does
# not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[200]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [201]:
# Apply scaling on the training set
# uses the means/standard deviations learned from X_train above
X_train_scaled = scaler.transform(X_train)
In [202]:
# Apply scaling on the test set
# reuses the training-set statistics (the scaler is never refit on the test split)
X_test_scaled = scaler.transform(X_test)
In [203]:
# Instantiate the RandomForestRegressor Model
# (these constructor values are placeholders — GridSearchCV below clones the
# estimator and tries every combination from param_grid instead)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate hyperparameter combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [204]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81-candidate x 10-fold search across all cores;
# results are unchanged because each fit still uses random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9910318902554458
In [205]:
# fit random forest model with best hyperparameters from above.
# param_grid keys match RandomForestRegressor's argument names exactly, so the
# tuned values can be unpacked directly instead of copied key by key.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [206]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two vectors renormalized as probability distributions — not a
# standard regression metric, and undefined for zero/negative entries; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.035576227278599215
R2 Score: 0.9964807366404823
RMSE: 0.188617
Entropy Value: 0.0016675150996863651
In [207]:
# Rank the tuned model's inputs by impurity-based importance, highest first.
# NOTE: these "features" are the principal-component scores that were labeled
# with the original column names when principal_df was built.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[207]:
feature importance
1 human_development_index 0.937639
2 gdp_per_capita 0.039984
0 hospital_beds_per_thousand 0.012847
3 population_density 0.007729
4 population 0.001800
In [208]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[208]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [209]:
country1 = 'Netherlands'
country2 = 'Serbia'

# Extracting important features for Random Forest Model Analysis for the population health index.
# Row filter and column subset are applied in a single .loc call.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [210]:
# preview the filtered two-country frame
df_updated
Out[210]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.717058
16755 Serbia 12/26/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716963
16756 Serbia 12/27/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716677
16757 Serbia 12/28/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716395
16758 Serbia 12/29/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716205

2075 rows × 10 columns

In [211]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous month
# mortality rates. shift() runs within each country group, so lags never bleed
# across a country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [212]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# NaNs are the leading rows of each country that have no lag available;
# one vectorized fillna over all three columns replaces the three per-column calls.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [213]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the input df_updated.iloc[:, 2:] is unscaled, so components are
# dominated by the largest-variance columns — consider standardizing before PCA.
# NOTE(review): iloc[:, 2:] appears to include 'Mortality Rate' and its lagged
# copies, i.e. the target leaks into the components — confirm the column order.
# NOTE(review): PCA is fit on all rows, before the train/test split below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[213]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [214]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# keep only the first n_components component scores for every row
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [215]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear mixture of
# ALL PCA inputs), not the named original variable; reusing the raw feature
# names makes the later feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# carry the country label alongside the component scores (row order is preserved)
principal_df['location'] = df_updated['location'].values
In [216]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by the
# model cells below (X is built from principal_df) — possibly dead code; confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [217]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the first 7 principal-component scores (labeled with the original column
# names upstream); y: the same-day mortality rate for the matching rows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): rows are daily time-series observations; a random split puts
# strongly autocorrelated neighboring days in both train and test, so the test
# scores are likely optimistic. Consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [218]:
# Fit scaling on the training set
# (statistics are learned from the training split only, so the test set does
# not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[218]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [219]:
# Apply scaling on the training set
# uses the means/standard deviations learned from X_train above
X_train_scaled = scaler.transform(X_train)
In [220]:
# Apply scaling on the test set
# reuses the training-set statistics (the scaler is never refit on the test split)
X_test_scaled = scaler.transform(X_test)
In [221]:
# Instantiate the RandomForestRegressor Model
# (these constructor values are placeholders — GridSearchCV below clones the
# estimator and tries every combination from param_grid instead)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate hyperparameter combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [222]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81-candidate x 10-fold search across all cores;
# results are unchanged because each fit still uses random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9988517103371555
In [223]:
# fit random forest model with best hyperparameters from above.
# param_grid keys match RandomForestRegressor's argument names exactly, so the
# tuned values can be unpacked directly instead of copied key by key.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [224]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two vectors renormalized as probability distributions — not a
# standard regression metric, and undefined for zero/negative entries; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005307061778710648
R2 Score: 0.9992985438334173
RMSE: 0.072850
Entropy Value: 0.00033274960964428744
In [225]:
# Rank the tuned model's inputs by impurity-based importance, highest first.
# NOTE: these "features" are the principal-component scores that were labeled
# with the original column names when principal_df was built.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[225]:
feature importance
6 median_age 0.956279
1 diabetes_prevalence 0.033001
5 aged_65_older 0.007184
0 cardiovasc_death_rate 0.001792
2 female_smokers 0.000972
3 male_smokers 0.000687
4 life_expectancy 0.000085
In [226]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[226]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [227]:
country1 = 'Netherlands'
country2 = 'Serbia'

# Extracting important features for the Random Forest Model Analysis for the country health index.
# Row filter and column subset are applied in a single .loc call.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [228]:
# preview the filtered two-country frame
df_updated
Out[228]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.320 0.944 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.320 0.944 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.320 0.944 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.320 0.944 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.320 0.944 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 14048.881 80.291 6871547 0.717058
16755 Serbia 12/26/2022 5.609 0.806 14048.881 80.291 6871547 0.716963
16756 Serbia 12/27/2022 5.609 0.806 14048.881 80.291 6871547 0.716677
16757 Serbia 12/28/2022 5.609 0.806 14048.881 80.291 6871547 0.716395
16758 Serbia 12/29/2022 5.609 0.806 14048.881 80.291 6871547 0.716205

2075 rows × 8 columns

In [229]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous month
# mortality rates. shift() runs within each country group, so lags never bleed
# across a country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [230]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# NaNs are the leading rows of each country that have no lag available;
# one vectorized fillna over all three columns replaces the three per-column calls.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [231]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the input df_updated.iloc[:, 2:] is unscaled, so components are
# dominated by the largest-variance columns — consider standardizing before PCA.
# NOTE(review): iloc[:, 2:] appears to include 'Mortality Rate' and its lagged
# copies, i.e. the target leaks into the components — confirm the column order.
# NOTE(review): PCA is fit on all rows, before the train/test split below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[231]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [232]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# keep only the first n_components component scores for every row
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [233]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear mixture of
# ALL PCA inputs), not the named original variable; reusing the raw feature
# names makes the later feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
# carry the country label alongside the component scores (row order is preserved)
principal_df['location'] = df_updated['location'].values
In [234]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by the
# model cells below (X is built from principal_df) — possibly dead code; confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [235]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
# X: the first 5 principal-component scores (labeled with the original column
# names upstream); y: the same-day mortality rate for the matching rows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): rows are daily time-series observations; a random split puts
# strongly autocorrelated neighboring days in both train and test, so the test
# scores are likely optimistic. Consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [236]:
# Fit scaling on the training set
# (statistics are learned from the training split only, so the test set does
# not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[236]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [237]:
# Apply scaling on the training set
# uses the means/standard deviations learned from X_train above
X_train_scaled = scaler.transform(X_train)
In [238]:
# Apply scaling on the test set
# reuses the training-set statistics (the scaler is never refit on the test split)
X_test_scaled = scaler.transform(X_test)
In [239]:
# Instantiate the RandomForestRegressor Model
# (these constructor values are placeholders — GridSearchCV below clones the
# estimator and tries every combination from param_grid instead)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid: 3*3*3*3 = 81 candidate hyperparameter combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [240]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81-candidate x 10-fold search across all cores;
# results are unchanged because each fit still uses random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998367181995075
In [241]:
# fit random forest model with best hyperparameters from above.
# param_grid keys match RandomForestRegressor's argument names exactly, so the
# tuned values can be unpacked directly instead of copied key by key.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [242]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two vectors renormalized as probability distributions — not a
# standard regression metric, and undefined for zero/negative entries; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007612509437698923
R2 Score: 0.9989938233412576
RMSE: 0.087250
Entropy Value: 0.0006561539548214934
In [243]:
# Rank the model inputs by the importance the fitted forest assigned them.
# NOTE(review): X was built from principal-component scores, so these labels
# name components, not the raw features — interpret with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[243]:
feature importance
1 human_development_index 0.955071
2 gdp_per_capita 0.039929
3 population_density 0.002952
0 hospital_beds_per_thousand 0.001518
4 population 0.000531
In [244]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[244]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [245]:
country1 = 'Slovenia'
country2 = 'Switzerland'

# Keep identifiers, the target, and the population-health-index features,
# restricted to the two countries being compared.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [246]:
# Preview the filtered two-country dataframe
df_updated
Out[246]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2101 rows × 10 columns

In [247]:
# Convert the time series into a supervised-learning problem: a random forest
# has no notion of time, so the mortality history must be encoded explicitly as
# lagged feature columns. The lags are computed per country via groupby so one
# country's series never bleeds into another's.
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for col, periods in lags.items():
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(periods)
In [248]:
# The first day/week/month of each country's series has no history;
# treat those missing lagged mortality values as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [249]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the unscaled columns from position 2 onward, which
# include 'Mortality Rate' (the target) and its lagged copies — confirm the
# target is meant to be part of the decomposition, and consider standardizing
# before PCA since the components are dominated by large-scale columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[249]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [250]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first 7 components (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [251]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of all
# inputs), so labelling them with the original feature names is misleading when
# interpreting downstream feature importances.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [252]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used as model inputs below
# (X is built from principal_df) — confirm this encoding step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [253]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # principal-component scores
y = df_updated['Mortality Rate'].values  # aligned with X by row position

# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [254]:
# Fit scaling on the training set
# (fit on training data only, so no information from the test set leaks in)
scaler = StandardScaler()
scaler.fit(X_train)
Out[254]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [255]:
# Apply scaling on the training set
# (scaler was fit on the training data only, so no test-set information leaks in)
X_train_scaled = scaler.transform(X_train)
In [256]:
# Apply scaling on the test set
# (reuses the training-set fit; the test set must never be fit on)
X_test_scaled = scaler.transform(X_test)
In [257]:
# Base random-forest regressor; fixed seed so the grid search is repeatable
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid explored by GridSearchCV (3*3*3*3 = 81 combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [258]:
# perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the estimator's score method, i.e. R^2 for a regressor;
# refit=True by default, so best_estimator_ is retrained on the full training set)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998589125309279
In [259]:
# Refit a fresh random forest on the full training set using the best
# hyperparameters found by the grid search, then predict on the test set.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [260]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments to
# probability distributions and returns their KL divergence, not an "entropy"
# of the prediction errors; it also assumes non-negative inputs. Confirm this
# metric is actually intended for regression evaluation.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0035706853382243314
R2 Score: 0.9987381127552114
RMSE: 0.059755
Entropy Value: 0.0005437521483565165
In [261]:
# Rank the model inputs by the importance the fitted forest assigned them.
# NOTE(review): X was built from principal-component scores, so these labels
# name components, not the raw features — interpret with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[261]:
feature importance
1 diabetes_prevalence 0.962368
2 female_smokers 0.022123
0 cardiovasc_death_rate 0.012033
3 male_smokers 0.001968
6 median_age 0.000829
5 aged_65_older 0.000417
4 life_expectancy 0.000261
In [262]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[262]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [263]:
country1 = 'Slovenia'
country2 = 'Switzerland'

# Keep identifiers, the target, and the country-health-index features,
# restricted to the two countries being compared.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
In [264]:
# Preview the filtered two-country dataframe
df_updated
Out[264]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 31400.840 102.619 2119843 0.536669

2101 rows × 8 columns

In [265]:
# Convert the time series into a supervised-learning problem: a random forest
# has no notion of time, so the mortality history must be encoded explicitly as
# lagged feature columns. The lags are computed per country via groupby so one
# country's series never bleeds into another's.
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for col, periods in lags.items():
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(periods)
In [266]:
# The first day/week/month of each country's series has no history;
# treat those missing lagged mortality values as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [267]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the unscaled columns from position 2 onward, which
# include 'Mortality Rate' (the target) and its lagged copies — confirm the
# target is meant to be part of the decomposition, and consider standardizing
# before PCA since the components are dominated by large-scale columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[267]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [268]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first 5 components (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [269]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of all
# inputs), so labelling them with the original feature names is misleading when
# interpreting downstream feature importances.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [270]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used as model inputs below
# (X is built from principal_df) — confirm this encoding step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [271]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # principal-component scores
y = df_updated['Mortality Rate'].values  # aligned with X by row position

# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [272]:
# Fit scaling on the training set
# (fit on training data only, so no information from the test set leaks in)
scaler = StandardScaler()
scaler.fit(X_train)
Out[272]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [273]:
# Apply scaling on the training set
# (scaler was fit on the training data only, so no test-set information leaks in)
X_train_scaled = scaler.transform(X_train)
In [274]:
# Apply scaling on the test set
# (reuses the training-set fit; the test set must never be fit on)
X_test_scaled = scaler.transform(X_test)
In [275]:
# Base random-forest regressor; fixed seed so the grid search is repeatable
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid explored by GridSearchCV (3*3*3*3 = 81 combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [276]:
# perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the estimator's score method, i.e. R^2 for a regressor;
# refit=True by default, so best_estimator_ is retrained on the full training set)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9984365005501525
In [277]:
# Refit a fresh random forest on the full training set using the best
# hyperparameters found by the grid search, then predict on the test set.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [278]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments to
# probability distributions and returns their KL divergence, not an "entropy"
# of the prediction errors; it also assumes non-negative inputs. Confirm this
# metric is actually intended for regression evaluation.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0053398329862075255
R2 Score: 0.9981128924852428
RMSE: 0.073074
Entropy Value: 0.0005566431428928525
In [279]:
# Rank the model inputs by the importance the fitted forest assigned them.
# NOTE(review): X was built from principal-component scores, so these labels
# name components, not the raw features — interpret with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[279]:
feature importance
1 human_development_index 0.973207
2 gdp_per_capita 0.022780
3 population_density 0.002330
0 hospital_beds_per_thousand 0.001255
4 population 0.000427
In [280]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[280]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [281]:
country1 = 'Austria'
country2 = 'Belgium'

# Keep identifiers, the target, and the population-health-index features,
# restricted to the two countries being compared.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [282]:
# Preview the filtered two-country dataframe
df_updated
Out[282]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2095 Belgium 12/26/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2096 Belgium 12/27/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2097 Belgium 12/28/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2098 Belgium 12/29/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787

2099 rows × 10 columns

In [283]:
# Convert the time series into a supervised-learning problem: a random forest
# has no notion of time, so the mortality history must be encoded explicitly as
# lagged feature columns. The lags are computed per country via groupby so one
# country's series never bleeds into another's.
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for col, periods in lags.items():
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(periods)
In [284]:
# The first day/week/month of each country's series has no history;
# treat those missing lagged mortality values as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [285]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the unscaled columns from position 2 onward, which
# include 'Mortality Rate' (the target) and its lagged copies — confirm the
# target is meant to be part of the decomposition, and consider standardizing
# before PCA since the components are dominated by large-scale columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[285]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [286]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first 7 components (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [287]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of all
# inputs), so labelling them with the original feature names is misleading when
# interpreting downstream feature importances.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [288]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used as model inputs below
# (X is built from principal_df) — confirm this encoding step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [289]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # principal-component scores
y = df_updated['Mortality Rate'].values  # aligned with X by row position

# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [290]:
# Fit scaling on the training set
# (fit on training data only, so no information from the test set leaks in)
scaler = StandardScaler()
scaler.fit(X_train)
Out[290]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [291]:
# Apply scaling on the training set
# (scaler was fit on the training data only, so no test-set information leaks in)
X_train_scaled = scaler.transform(X_train)
In [292]:
# Apply scaling on the test set
# (reuses the training-set fit; the test set must never be fit on)
X_test_scaled = scaler.transform(X_test)
In [293]:
# Base random-forest regressor; fixed seed so the grid search is repeatable
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid explored by GridSearchCV (3*3*3*3 = 81 combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [294]:
# perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the estimator's score method, i.e. R^2 for a regressor;
# refit=True by default, so best_estimator_ is retrained on the full training set)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985702277201526
In [295]:
# Refit a fresh random forest on the full training set using the best
# hyperparameters found by the grid search, then predict on the test set.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [296]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments to
# probability distributions and returns their KL divergence, not an "entropy"
# of the prediction errors; it also assumes non-negative inputs. Confirm this
# metric is actually intended for regression evaluation.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.020452761128882797
R2 Score: 0.9982725374621599
RMSE: 0.143013
Entropy Value: 0.0008156180119545589
In [297]:
# Rank the model inputs by the importance the fitted forest assigned them.
# NOTE(review): X was built from principal-component scores, so these labels
# name components, not the raw features — interpret with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[297]:
feature importance
6 median_age 0.853422
1 diabetes_prevalence 0.089967
0 cardiovasc_death_rate 0.036597
5 aged_65_older 0.011508
3 male_smokers 0.004570
2 female_smokers 0.003805
4 life_expectancy 0.000130
In [298]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[298]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [299]:
country1 = 'Austria'
country2 = 'Belgium'

# Keep identifiers, the target, and the country-health-index features,
# restricted to the two countries being compared.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
In [300]:
# Preview the filtered two-country dataframe
df_updated
Out[300]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 5.64 0.931 42658.576 375.564 11655923 0.711787
2095 Belgium 12/26/2022 5.64 0.931 42658.576 375.564 11655923 0.711787
2096 Belgium 12/27/2022 5.64 0.931 42658.576 375.564 11655923 0.711787
2097 Belgium 12/28/2022 5.64 0.931 42658.576 375.564 11655923 0.711787
2098 Belgium 12/29/2022 5.64 0.931 42658.576 375.564 11655923 0.711787

2099 rows × 8 columns

In [301]:
# Convert the time series into a supervised-learning problem: a random forest
# has no notion of time, so the mortality history must be encoded explicitly as
# lagged feature columns. The lags are computed per country via groupby so one
# country's series never bleeds into another's.
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for col, periods in lags.items():
    df_updated[col] = df_updated.groupby(['location'])['Mortality Rate'].shift(periods)
In [302]:
# The first day/week/month of each country has no lag history; impute those NaNs with 0.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [303]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which includes
# 'Mortality Rate' (the target) and its three lagged copies — the target leaks into the
# components and likely inflates downstream scores; confirm this is intended.
# NOTE(review): PCA here runs on unscaled data, so large-magnitude columns (e.g.
# population) dominate the components; consider standardizing first — TODO confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[303]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [304]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): the retained components are linear mixes of ALL columns fed to PCA
# (including the mortality lags), not the 5 named features themselves.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [305]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): labeling principal components with the original feature names is
# misleading — the later feature-importance table ranks components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [306]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X comes from principal_df and
# y from 'Mortality Rate'), so verify this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [307]:
# Columns of principal_df (these hold PCA scores, despite carrying the raw feature names).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split mixes adjacent, nearly identical days across
# train/test — scores are likely optimistic for time-series data; TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [308]:
# Fit the standardizer on the training portion only, so the test data stays unseen.
scaler = StandardScaler().fit(X_train)
scaler
Out[308]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [309]:
# Apply scaling on the training set (uses the train-fitted mean/std from the previous cell)
X_train_scaled = scaler.transform(X_train)
In [310]:
# Apply scaling on the test set (reuses the train-fitted statistics; no refitting on test data)
X_test_scaled = scaler.transform(X_test)
In [311]:
# Base estimator for the search; the grid below overrides n_estimators per candidate.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid for the exhaustive search (3 * 3 * 3 * 3 = 81 candidates).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [312]:
# Perform grid search with 10-fold cross-validation (k = 10).
# NOTE(review): default scorer for a regressor is R^2; the rows were pre-shuffled by
# train_test_split, so CV folds mix adjacent days — scores are likely optimistic for
# time-series data (consider sklearn's TimeSeriesSplit).
# n_jobs=-1 parallelizes the 81-candidate x 10-fold search across all cores; results
# are unchanged because every fit keeps its own fixed random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974451044142482
In [313]:
# Refit a forest configured with the winning hyperparameters on the full training set.
# (** unpacks the exact same four keys the grid searched over.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out 30% for evaluation in the next cell.
y_pred = best_rf_model.predict(X_test_scaled)
In [314]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) normalizes both vectors into probability
# distributions and returns their KL divergence — not a standard regression error
# metric, and zeros in the vectors can make it ill-defined; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010069518672884694
R2 Score: 0.9991495174577224
RMSE: 0.100347
Entropy Value: 0.00034019584039697294
In [315]:
# Rank the (PCA-derived) predictors by their impurity-based importance in the fitted forest.
# NOTE: the 'feature' labels are principal components relabeled with raw feature names.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[315]:
feature importance
1 human_development_index 0.935823
2 gdp_per_capita 0.050241
0 hospital_beds_per_thousand 0.009467
3 population_density 0.003676
4 population 0.000793
In [316]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory
# (e.g. a pathlib.Path DATA_DIR constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[316]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [317]:
country1 = 'Canada'
country2 = 'Denmark'

# Extracting important features for Random Forest Model Analysis for the population health index.
# .copy() makes the two-country subset an independent frame so the lagged-column
# assignments in later cells do not raise SettingWithCopyWarning / write into a view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [318]:
df_updated
Out[318]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5188 Denmark 2/3/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5189 Denmark 2/4/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5190 Denmark 2/5/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5191 Denmark 2/6/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2134 rows × 10 columns

In [319]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [320]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 rows per country have no lag history; imputing 0 treats
# them as zero mortality rather than missing — confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [321]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lagged copies,
# so the target leaks into the components and likely inflates downstream scores.
# NOTE(review): PCA runs on unscaled data here, so large-magnitude columns dominate — TODO confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[321]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [322]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): the retained components are linear mixes of ALL columns fed to PCA
# (including the mortality lags), not the 7 named features themselves.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [323]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): labeling principal components with the original feature names is
# misleading — the later feature-importance table ranks components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [324]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X comes from principal_df); verify this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [325]:
# Columns of principal_df (these hold PCA scores, despite carrying the raw feature names).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split mixes adjacent, nearly identical days across
# train/test — scores are likely optimistic for time-series data; TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [326]:
# Fit scaling on the training set only, so the test data stays unseen.
scaler = StandardScaler()
scaler.fit(X_train)
Out[326]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [327]:
# Apply scaling on the training set (uses the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [328]:
# Apply scaling on the test set (reuses the train-fitted statistics; no refitting)
X_test_scaled = scaler.transform(X_test)
In [329]:
# Instantiate the RandomForestRegressor Model (the grid below overrides n_estimators)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3 * 3 * 3 * 3 = 81 candidates)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [330]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scorer for a regressor is R^2; rows were pre-shuffled by
# train_test_split, so CV folds mix adjacent days — scores are likely optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9984540951428933
In [331]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out 30% for evaluation below
y_pred = best_rf_model.predict(X_test_scaled)
In [332]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) normalizes both vectors and returns KL
# divergence — not a standard regression metric; zeros can make it ill-defined.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002748350813295417
R2 Score: 0.9993440760119627
RMSE: 0.052425
Entropy Value: 0.00028602597187348704
In [333]:
# Rank predictors by impurity-based importance from the fitted forest.
# NOTE(review): these "features" are principal components relabeled with raw feature
# names, so the ranking does not directly measure the raw features' importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[333]:
feature importance
1 diabetes_prevalence 0.742827
0 cardiovasc_death_rate 0.187586
6 median_age 0.034101
2 female_smokers 0.019551
5 aged_65_older 0.014214
3 male_smokers 0.001588
4 life_expectancy 0.000133
In [334]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory
# (e.g. a pathlib.Path DATA_DIR constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[334]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [335]:
country1 = 'Canada'
country2 = 'Denmark'

# Extracting important features for the Random Forest Model Analysis for the country health index.
# .copy() makes the two-country subset an independent frame so the lagged-column
# assignments in later cells do not raise SettingWithCopyWarning / write into a view.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [336]:
df_updated
Out[336]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.5 0.940 46682.515 136.520 5882259 0.000000
5188 Denmark 2/3/2020 2.5 0.940 46682.515 136.520 5882259 0.000000
5189 Denmark 2/4/2020 2.5 0.940 46682.515 136.520 5882259 0.000000
5190 Denmark 2/5/2020 2.5 0.940 46682.515 136.520 5882259 0.000000
5191 Denmark 2/6/2020 2.5 0.940 46682.515 136.520 5882259 0.000000
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 44017.591 4.037 38454328 1.093162

2134 rows × 8 columns

In [337]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [338]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 rows per country have no lag history; imputing 0 treats
# them as zero mortality rather than missing — confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [339]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lagged copies,
# so the target leaks into the components and likely inflates downstream scores.
# NOTE(review): PCA runs on unscaled data here, so large-magnitude columns dominate — TODO confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[339]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [340]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): the retained components are linear mixes of ALL columns fed to PCA
# (including the mortality lags), not the 5 named features themselves.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [341]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): labeling principal components with the original feature names is
# misleading — the later feature-importance table ranks components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [342]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X comes from principal_df); verify this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [343]:
# Columns of principal_df (these hold PCA scores, despite carrying the raw feature names).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split mixes adjacent, nearly identical days across
# train/test — scores are likely optimistic for time-series data; TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [344]:
# Fit scaling on the training set only, so the test data stays unseen.
scaler = StandardScaler()
scaler.fit(X_train)
Out[344]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [345]:
# Apply scaling on the training set (uses the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [346]:
# Apply scaling on the test set (reuses the train-fitted statistics; no refitting)
X_test_scaled = scaler.transform(X_test)
In [347]:
# Instantiate the RandomForestRegressor Model (the grid below overrides n_estimators)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3 * 3 * 3 * 3 = 81 candidates)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [348]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scorer for a regressor is R^2; rows were pre-shuffled by
# train_test_split, so CV folds mix adjacent days — scores are likely optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985212702477563
In [349]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# predictions on the held-out 30% for evaluation below
y_pred = best_rf_model.predict(X_test_scaled)
In [350]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) normalizes both vectors and returns KL
# divergence — not a standard regression metric; zeros can make it ill-defined.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004149548610657574
R2 Score: 0.9990096648287803
RMSE: 0.064417
Entropy Value: 0.0005668080870825645
In [351]:
# Rank predictors by impurity-based importance from the fitted forest.
# NOTE(review): these "features" are principal components relabeled with raw feature
# names, so the ranking does not directly measure the raw features' importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[351]:
feature importance
1 human_development_index 0.937978
0 hospital_beds_per_thousand 0.035832
2 gdp_per_capita 0.021979
3 population_density 0.003812
4 population 0.000400
In [352]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory
# (e.g. a pathlib.Path DATA_DIR constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[352]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [353]:
country1 = 'Estonia'
country2 = 'Iceland'

# Extracting important features for Random Forest Model Analysis for the population health index.
# .copy() makes the two-country subset an independent frame so the lagged-column
# assignments in later cells do not raise SettingWithCopyWarning / write into a view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [354]:
df_updated
Out[354]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
... ... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2097 rows × 10 columns

In [355]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [356]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 rows per country have no lag history; imputing 0 treats
# them as zero mortality rather than missing — confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [357]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lagged copies,
# so the target leaks into the components and likely inflates downstream scores.
# NOTE(review): PCA runs on unscaled data here, so large-magnitude columns dominate — TODO confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[357]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [358]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): the retained components are linear mixes of ALL columns fed to PCA
# (including the mortality lags), not the 7 named features themselves.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [359]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): labeling principal components with the original feature names is
# misleading — the later feature-importance table ranks components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [360]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X comes from principal_df); verify this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [361]:
# Columns of principal_df (these hold PCA scores, despite carrying the raw feature names).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split mixes adjacent, nearly identical days across
# train/test — scores are likely optimistic for time-series data; TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [362]:
# Fit scaling on the training set only, so the test data stays unseen.
scaler = StandardScaler()
scaler.fit(X_train)
Out[362]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [363]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [364]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [365]:
# Base Random Forest regressor; the grid below supplies every hyperparameter it tunes,
# so the n_estimators given here is only a placeholder.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Hyperparameter search space: 3 x 3 x 3 x 3 = 81 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [366]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 candidate fits across all CPU cores; results are unchanged
# because each fit is still seeded by the estimator's random_state.
# NOTE(review): plain k-fold CV shuffles time-series rows across folds — consider
# sklearn's TimeSeriesSplit for an honest score estimate.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9984258099692642
In [367]:
# fit random forest model with best hyperparameters from above
# Unpack the winning grid-search parameters directly instead of copying each one out of
# best_params_ by hand — identical model, no risk of the two lists drifting apart.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test split for the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [368]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after normalizing both
# arrays into probability distributions — it is not a standard regression error metric,
# and zero entries in y_test make it ill-defined; interpret this number with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0015282099631709807
R2 Score: 0.9971614614419819
RMSE: 0.039092
Entropy Value: 0.0011429224525441673
In [369]:
# Rank the model inputs by impurity-based Random Forest importance, largest first.
# (Labels come from selected_cols, i.e. the PCA-component names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[369]:
feature importance
1 diabetes_prevalence 0.563638
0 cardiovasc_death_rate 0.377660
6 median_age 0.054616
5 aged_65_older 0.002951
2 female_smokers 0.000807
3 male_smokers 0.000252
4 life_expectancy 0.000076
In [370]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path is not portable — prefer a DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[370]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [371]:
country1 = 'Estonia'
country2 = 'Iceland'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() so the later lagged-column assignments write to an independent frame instead of
# a view of the loaded data (avoids pandas' SettingWithCopyWarning / silent no-op writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [372]:
df_updated
Out[372]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 29481.252 31.033 1326064 0.00000
6250 Estonia 1/18/2020 4.69 0.892 29481.252 31.033 1326064 0.00000
6251 Estonia 2/5/2020 4.69 0.892 29481.252 31.033 1326064 0.00000
6252 Estonia 2/6/2020 4.69 0.892 29481.252 31.033 1326064 0.00000
6253 Estonia 2/7/2020 4.69 0.892 29481.252 31.033 1326064 0.00000
... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 46482.958 3.404 372903 0.11011

2097 rows × 8 columns

In [373]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Take an explicit copy first: df_updated is a filtered slice of the loaded frame, and
# adding columns to a slice triggers pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()
# groupby('location') keeps each country's series separate so lags never cross countries.
# NOTE(review): shift() assumes rows are date-ordered within each country — confirm upstream sort.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [374]:
# Zero-fill the NaNs that shift() introduced at the start of each country's series,
# for all three lagged-mortality columns in one vectorized fillna call.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [375]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so 'Mortality Rate' (the
# target) and its lagged copies are part of the PCA inputs — target leakage that likely
# inflates the downstream scores; confirm the intended feature set.
# NOTE(review): inputs are unscaled, so high-variance columns (e.g. population) dominate
# the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[375]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [376]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first n_components (highest-variance) columns of the PCA projection.
# NOTE(review): same leakage as the fit cell — iloc[:, 2:] still includes the target
# and lagged-target columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [377]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL PCA inputs),
# not the original features — reusing the raw feature names here is misleading and the
# mislabels carry through to the feature-importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [378]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not part of X below (X comes
# from principal_df), so this encoding appears unused — verify it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [379]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
# X holds the principal components (labelled with original feature names); y is the raw target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): rows are daily time-series observations, so a shuffled random split puts
# near-duplicate adjacent days in both train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [380]:
# Fit scaling on the training set
# Learn mean/std from the training split only, so no test-set statistics leak into scaling.
# (Tree ensembles do not require feature scaling, but it is harmless here.)
scaler = StandardScaler()
scaler.fit(X_train)
Out[380]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [381]:
# Apply scaling on the training set
# Uses the mean/std learned from the training split above.
X_train_scaled = scaler.transform(X_train)
In [382]:
# Apply scaling on the test set
# Same training-set statistics applied to the test split (no refit).
X_test_scaled = scaler.transform(X_test)
In [383]:
# Base Random Forest regressor; the grid below supplies every hyperparameter it tunes,
# so the n_estimators given here is only a placeholder.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Hyperparameter search space: 3 x 3 x 3 x 3 = 81 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [384]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 candidate fits across all CPU cores; results are unchanged
# because each fit is still seeded by the estimator's random_state.
# NOTE(review): plain k-fold CV shuffles time-series rows across folds — consider
# sklearn's TimeSeriesSplit for an honest score estimate.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9967186075484848
In [385]:
# fit random forest model with best hyperparameters from above
# Unpack the winning grid-search parameters directly instead of copying each one out of
# best_params_ by hand — identical model, no risk of the two lists drifting apart.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test split for the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [386]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after normalizing both
# arrays into probability distributions — it is not a standard regression error metric,
# and zero entries in y_test make it ill-defined; interpret this number with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003177612198314735
R2 Score: 0.9940978170770269
RMSE: 0.056370
Entropy Value: 0.001954468736081089
In [387]:
# Rank the model inputs by impurity-based Random Forest importance, largest first.
# (Labels come from selected_cols, i.e. the PCA-component names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[387]:
feature importance
1 human_development_index 0.915291
0 hospital_beds_per_thousand 0.046809
2 gdp_per_capita 0.035884
3 population_density 0.001125
4 population 0.000891
In [388]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path is not portable — prefer a DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[388]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [389]:
country1 = 'Ireland'
country2 = 'Latvia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() so the later lagged-column assignments write to an independent frame instead of
# a view of the loaded data (avoids pandas' SettingWithCopyWarning / silent no-op writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [390]:
df_updated
Out[390]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18839 Ireland 3/1/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18840 Ireland 3/2/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18841 Ireland 3/3/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18842 Ireland 3/4/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2073 rows × 10 columns

In [391]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Take an explicit copy first: df_updated is a filtered slice of the loaded frame, and
# adding columns to a slice triggers pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()
# groupby('location') keeps each country's series separate so lags never cross countries.
# NOTE(review): shift() assumes rows are date-ordered within each country — confirm upstream sort.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [392]:
# Zero-fill the NaNs that shift() introduced at the start of each country's series,
# for all three lagged-mortality columns in one vectorized fillna call.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [393]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so 'Mortality Rate' (the
# target) and its lagged copies are part of the PCA inputs — target leakage that likely
# inflates the downstream scores; confirm the intended feature set.
# NOTE(review): inputs are unscaled, so high-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[393]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [394]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first n_components (highest-variance) columns of the PCA projection.
# NOTE(review): same leakage as the fit cell — iloc[:, 2:] still includes the target
# and lagged-target columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [395]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL PCA inputs),
# not the original features — reusing the raw feature names here is misleading and the
# mislabels carry through to the feature-importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [396]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not part of X below (X comes
# from principal_df), so this encoding appears unused — verify it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [397]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the principal components (labelled with original feature names); y is the raw target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): rows are daily time-series observations, so a shuffled random split puts
# near-duplicate adjacent days in both train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [398]:
# Fit scaling on the training set
# Learn mean/std from the training split only, so no test-set statistics leak into scaling.
# (Tree ensembles do not require feature scaling, but it is harmless here.)
scaler = StandardScaler()
scaler.fit(X_train)
Out[398]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [399]:
# Apply scaling on the training set
# Uses the mean/std learned from the training split above.
X_train_scaled = scaler.transform(X_train)
In [400]:
# Apply scaling on the test set
# Same training-set statistics applied to the test split (no refit).
X_test_scaled = scaler.transform(X_test)
In [401]:
# Base Random Forest regressor; the grid below supplies every hyperparameter it tunes,
# so the n_estimators given here is only a placeholder.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Hyperparameter search space: 3 x 3 x 3 x 3 = 81 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [402]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 candidate fits across all CPU cores; results are unchanged
# because each fit is still seeded by the estimator's random_state.
# NOTE(review): plain k-fold CV shuffles time-series rows across folds — consider
# sklearn's TimeSeriesSplit for an honest score estimate.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9980552037491671
In [403]:
# fit random forest model with best hyperparameters from above
# Unpack the winning grid-search parameters directly instead of copying each one out of
# best_params_ by hand — identical model, no risk of the two lists drifting apart.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test split for the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [404]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after normalizing both
# arrays into probability distributions — it is not a standard regression error metric,
# and zero entries in y_test make it ill-defined; interpret this number with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0032364660760083627
R2 Score: 0.9985316671525546
RMSE: 0.056890
Entropy Value: 0.0004855826815172552
In [405]:
# Rank the model inputs by impurity-based Random Forest importance, largest first.
# (Labels come from selected_cols, i.e. the PCA-component names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[405]:
feature importance
1 diabetes_prevalence 0.739516
0 cardiovasc_death_rate 0.219378
2 female_smokers 0.032388
5 aged_65_older 0.003260
3 male_smokers 0.002877
6 median_age 0.002109
4 life_expectancy 0.000472
In [406]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path is not portable — prefer a DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[406]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [407]:
country1 = 'Ireland'
country2 = 'Latvia'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() so the later lagged-column assignments write to an independent frame instead of
# a view of the loaded data (avoids pandas' SettingWithCopyWarning / silent no-op writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [408]:
df_updated
Out[408]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 2.96 0.955 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 2.96 0.955 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 2.96 0.955 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 2.96 0.955 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 25063.846 31.212 1850654 0.631969

2073 rows × 8 columns

In [409]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Take an explicit copy first: df_updated is a filtered slice of the loaded frame, and
# adding columns to a slice triggers pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()
# groupby('location') keeps each country's series separate so lags never cross countries.
# NOTE(review): shift() assumes rows are date-ordered within each country — confirm upstream sort.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [410]:
# Zero-fill the NaNs that shift() introduced at the start of each country's series,
# for all three lagged-mortality columns in one vectorized fillna call.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [411]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so 'Mortality Rate' (the
# target) and its lagged copies are part of the PCA inputs — target leakage that likely
# inflates the downstream scores; confirm the intended feature set.
# NOTE(review): inputs are unscaled, so high-variance columns (e.g. population) dominate
# the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[411]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [412]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first n_components (highest-variance) columns of the PCA projection.
# NOTE(review): same leakage as the fit cell — iloc[:, 2:] still includes the target
# and lagged-target columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [413]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL PCA inputs),
# not the original features — reusing the raw feature names here is misleading and the
# mislabels carry through to the feature-importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [414]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not part of X below (X comes
# from principal_df), so this encoding appears unused — verify it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [415]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
# X holds the principal components (labelled with original feature names); y is the raw target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): rows are daily time-series observations, so a shuffled random split puts
# near-duplicate adjacent days in both train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [416]:
# Fit scaling on the training set
# Learn mean/std from the training split only, so no test-set statistics leak into scaling.
# (Tree ensembles do not require feature scaling, but it is harmless here.)
scaler = StandardScaler()
scaler.fit(X_train)
Out[416]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [417]:
# Apply scaling on the training set
# Uses the mean/std learned from the training split above.
X_train_scaled = scaler.transform(X_train)
In [418]:
# Apply scaling on the test set
# Same training-set statistics applied to the test split (no refit).
X_test_scaled = scaler.transform(X_test)
In [419]:
# Base Random Forest regressor; the grid below supplies every hyperparameter it tunes,
# so the n_estimators given here is only a placeholder.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Hyperparameter search space: 3 x 3 x 3 x 3 = 81 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [420]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 candidate fits across all CPU cores; results are unchanged
# because each fit is still seeded by the estimator's random_state.
# NOTE(review): plain k-fold CV shuffles time-series rows across folds — consider
# sklearn's TimeSeriesSplit for an honest score estimate.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9977520979997934
In [421]:
# fit random forest model with best hyperparameters from above
# Unpack the winning grid-search parameters directly instead of copying each one out of
# best_params_ by hand — identical model, no risk of the two lists drifting apart.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test split for the evaluation cell below.
y_pred = best_rf_model.predict(X_test_scaled)
In [422]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after normalizing both
# arrays into probability distributions — it is not a standard regression error metric,
# and zero entries in y_test make it ill-defined; interpret this number with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006423810097061833
R2 Score: 0.9970856201950677
RMSE: 0.080149
Entropy Value: 0.0011599657894503636
In [423]:
# Rank the model inputs by impurity-based Random Forest importance, largest first.
# (Labels come from selected_cols, i.e. the PCA-component names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[423]:
feature importance
1 human_development_index 0.948187
2 gdp_per_capita 0.038317
0 hospital_beds_per_thousand 0.009184
3 population_density 0.003609
4 population 0.000702
In [424]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path is not portable — prefer a DATA_DIR constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[424]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [425]:
country1 = 'Luxembourg'
country2 = 'Portugal'

# Restrict the frame to the population-health feature set (plus identifiers and the
# mortality target) for the selected country pair, in a single indexed selection.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [426]:
df_updated
Out[426]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872

2075 rows × 10 columns

In [427]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (1-day, 7-day, 30-day); groupby('location') keeps the
# shifts from spilling across the boundary between the two countries. The first 1/7/30
# rows per country become NaN and are zero-filled in the next cell.
# NOTE(review): df_updated is a filtered slice of the original frame, so these column
# assignments may raise SettingWithCopyWarning — add .copy() after the isin() filter.
# NOTE(review): these lag columns (together with the raw target) are later fed into PCA
# via iloc[:, 2:], leaking the target into the model's inputs — see the PCA cell below.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [428]:
# Zero-fill the NaNs that the lag shifts introduced for rows with no prior observation.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [429]:
# Perform Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the prediction target) and its three lag columns. The
# principal components therefore encode the target itself — target leakage that largely
# explains the near-perfect CV and R^2 scores downstream. Exclude the target (and decide
# deliberately about the lags) before fitting PCA.
# NOTE(review): PCA is also fitted on the full dataset before the train/test split
# (test-set information leaks into the components) and on unscaled columns, so
# large-magnitude variables dominate the variance. Fit PCA on scaled training data only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[429]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [430]:
# Keep the first 7 principal components (chosen to match the count of population-health
# input variables).
# NOTE(review): the PCA above was fitted on 11 columns (7 features + target + 3 lags),
# so these 7 components are NOT in one-to-one correspondence with the 7 named features.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [431]:
# Build a dataframe of the first n_components principal-component scores.
# NOTE(review): the PC columns are relabelled with original feature names
# ('cardiovasc_death_rate', ...), but each column is a principal component — a linear
# mix of ALL columns fed to PCA (including the mortality target and its lags) — not the
# named feature. The downstream "feature importances" inherit these misleading labels;
# prefer neutral names such as PC1..PC7.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [432]:
# One-hot encode the 'location' column with get_dummies().
# NOTE(review): the resulting dummy columns are never used downstream — X is built
# entirely from principal_df and y from the unchanged 'Mortality Rate' column — so this
# cell has no effect on the model and could be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [433]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# NOTE(review): these are PCA score columns carrying borrowed feature names
# (see the principal_df construction above), not the raw variables.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split into train/test (70/30) for the Random Forest model.
# NOTE(review): a random shuffled split on daily time-series rows places near-duplicate
# neighbouring days in both train and test — consider a time-based split for honest
# evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [434]:
# Fit the standard scaler on the training split only (correct: the test split is not
# used to estimate the scaling parameters).
scaler = StandardScaler()
scaler.fit(X_train)
Out[434]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [435]:
# Transform the training split with the train-fitted scaler
X_train_scaled = scaler.transform(X_train)
In [436]:
# Transform the test split with the same train-fitted scaler (no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [437]:
# Instantiate the base RandomForestRegressor.
# NOTE(review): n_estimators=100 here is only a placeholder — GridSearchCV below
# overrides it with the values in param_grid.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [438]:
# Grid search with 10-fold cross-validation (k = 10) over the scaled training data.
# NOTE(review): 81 candidates x 10 folds x up to 200 trees is slow; passing n_jobs=-1
# to GridSearchCV would parallelise across cores.
# NOTE(review): the near-1.0 CV score reflects the target leakage flagged at the PCA
# step, not genuine predictive power.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the corresponding mean CV (R^2) score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9965116751526303
In [439]:
# Refit a fresh random forest configured with the best hyperparameters found above.
# (Unpacking best_params_ is equivalent to listing each key explicitly; note that
# grid_search.best_estimator_ is already refit on the training data by default.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test split
y_pred = best_rf_model.predict(X_test_scaled)
In [440]:
# Evaluate the Random Forest model: MSE, RMSE, R^2, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes the KL divergence between p and q
# after normalising each array to a probability distribution — it is not a standard
# regression metric, and it is undefined (inf) wherever y_pred is 0 while y_test is not.
# Treat this number with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001917842434237941
R2 Score: 0.9976831679134813
RMSE: 0.043793
Entropy Value: 0.00042605878052921417
In [441]:
# Tabulate impurity-based importances from the fitted forest.
# NOTE(review): the rows are principal components carrying borrowed feature names
# (see the principal_df cell above); reading them as importances of the original
# epidemiological variables is not valid.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[441]:
feature importance
1 diabetes_prevalence 0.772730
0 cardiovasc_death_rate 0.186688
2 female_smokers 0.025317
5 aged_65_older 0.008044
6 median_age 0.004079
3 male_smokers 0.002494
4 life_expectancy 0.000648
In [442]:
# Re-import the per-country dataframe for the country-health-index analysis.
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
# NOTE(review): cells In[442]-In[459] repeat In[424]-In[441] almost verbatim — wrap the
# whole pipeline in a function parameterised by (country1, country2, feature_cols)
# instead of copy-pasting it per country pair and per feature set.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[442]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [443]:
country1 = 'Luxembourg'
country2 = 'Portugal'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [444]:
df_updated
Out[444]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 94277.965 231.447 647601 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 94277.965 231.447 647601 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 94277.965 231.447 647601 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 94277.965 231.447 647601 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 94277.965 231.447 647601 0.377872

2075 rows × 8 columns

In [445]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [446]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [447]:
# Perform PCA to address multi-collinearity among the country-health predictors.
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date'; the PCA input still
# includes 'Mortality Rate' (the target) and its three lag columns, so the components
# encode the target — target leakage that inflates the downstream CV/R^2 scores.
# NOTE(review): PCA is also fitted before the train/test split and on unscaled columns
# (population spans roughly 6e5 to 1e7 here and dominates the variance).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[447]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [448]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [449]:
# Collect the first n_components principal-component scores.
# NOTE(review): the columns are labelled with original feature names, but each column is
# a principal component (a mix of all PCA inputs, including the target and its lags),
# so the later "feature importances" do not describe the named variables. Prefer
# neutral labels such as PC1..PC5.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [450]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [451]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [452]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[452]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [453]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [454]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [455]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [456]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9970475062842402
In [457]:
# Refit a fresh random forest configured with the best hyperparameters found above.
# (Unpacking best_params_ is equivalent to listing each key explicitly; note that
# grid_search.best_estimator_ is already refit on the training data by default.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test split
y_pred = best_rf_model.predict(X_test_scaled)
In [458]:
# Evaluate the Random Forest model: MSE, RMSE, R^2, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between the two arrays
# normalised to probability distributions — not a regression metric, and undefined (inf)
# wherever y_pred is 0 while y_test is not. Treat with caution or drop.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001444462921026543
R2 Score: 0.9982550297232574
RMSE: 0.038006
Entropy Value: 0.00038138798942089766
In [459]:
# Tabulate impurity-based importances from the fitted forest.
# NOTE(review): the rows are principal components carrying borrowed feature names;
# reading them as importances of the original country-health variables is not valid.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[459]:
feature importance
1 human_development_index 0.915538
0 hospital_beds_per_thousand 0.041139
2 gdp_per_capita 0.037230
3 population_density 0.005518
4 population 0.000575
In [460]:
# Re-import the per-country dataframe for the Slovakia/Sweden population-health run.
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
# NOTE(review): this section duplicates In[424]-In[441] with only the country pair
# changed — a parameterised function would replace all four copies.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[460]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [461]:
country1 = 'Slovakia'
country2 = 'Sweden'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [462]:
df_updated
Out[462]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.816005

2092 rows × 10 columns

In [463]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [464]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [465]:
# Perform PCA to address multi-collinearity.
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input still
# contains 'Mortality Rate' (the prediction target) and its three lag columns —
# target leakage that inflates the downstream CV/R^2 scores. PCA is also fitted on the
# full dataset before the train/test split, and on unscaled columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[465]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [466]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [467]:
# Collect the first n_components principal-component scores.
# NOTE(review): columns are relabelled with original feature names, but each is a
# principal component mixing ALL PCA inputs (target and lags included) — the later
# "feature importances" inherit these misleading labels. Prefer PC1..PC7.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [468]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [469]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [470]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[470]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [471]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [472]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [473]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [474]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9956178650087402
In [475]:
# Refit a fresh random forest configured with the best hyperparameters found above.
# (Unpacking best_params_ is equivalent to listing each key explicitly; note that
# grid_search.best_estimator_ is already refit on the training data by default.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out, scaled test split
y_pred = best_rf_model.predict(X_test_scaled)
In [476]:
# Evaluate the Random Forest model: MSE, RMSE, R^2, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between the two arrays
# normalised to probability distributions — not a regression metric, and undefined (inf)
# wherever y_pred is 0 while y_test is not. Treat with caution or drop.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.028400997059026503
R2 Score: 0.993865518212125
RMSE: 0.168526
Entropy Value: 0.001669275065011816
In [477]:
# Tabulate impurity-based importances from the fitted forest.
# NOTE(review): the rows are principal components carrying borrowed feature names;
# reading them as importances of the original epidemiological variables is not valid.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[477]:
feature importance
1 diabetes_prevalence 0.547703
0 cardiovasc_death_rate 0.411195
5 aged_65_older 0.017001
6 median_age 0.011947
2 female_smokers 0.008400
3 male_smokers 0.003192
4 life_expectancy 0.000561
In [478]:
# Re-import the per-country dataframe for the Slovakia/Sweden country-health run.
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
# NOTE(review): fourth near-verbatim copy of the same pipeline — parameterise instead.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[478]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [479]:
country1 = 'Slovakia'
country2 = 'Sweden'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [480]:
df_updated
Out[480]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 46949.283 24.718 10549349 0.816005

2092 rows × 8 columns

In [481]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [482]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [483]:
# Perform PCA to address multi-collinearity among the country-health predictors.
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date'; the PCA input still
# includes 'Mortality Rate' (the target) and its three lag columns — target leakage
# that inflates the downstream scores. PCA is also fitted before the train/test split
# and on unscaled columns (population dominates the variance).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[483]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [484]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [485]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [486]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [487]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [488]:
# Fit scaling on the training set
# Scaler statistics are learned from the training split only (avoids test-set leakage
# at this step).
scaler = StandardScaler()
scaler.fit(X_train)
Out[488]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [489]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [490]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [491]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): n_estimators=100 here is superseded by the 'n_estimators' values in
# param_grid during the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [492]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; 3*3*3*3 = 81 candidates x 10 folds = 810 fits.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9937432736131026
In [493]:
# fit random forest model with best hyperparameters from above
# NOTE(review): GridSearchCV (refit=True by default) already refits the best model on
# the full training set -- grid_search.best_estimator_ could be used directly instead
# of rebuilding it by hand.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test split for the evaluation below.
y_pred = best_rf_model.predict(X_test_scaled)
In [494]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) treats its arguments as (unnormalized)
# probability distributions and returns the KL divergence; mortality-rate vectors are
# not distributions, so this value is not a meaningful regression metric -- review.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.043275743993950025
R2 Score: 0.990652642833775
RMSE: 0.208028
Entropy Value: 0.0026706049946896467
In [495]:
# NOTE(review): these importances are over the PCA component scores, which were
# (mis)labelled with original feature names above -- they do not rank the raw features.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[495]:
feature importance
1 human_development_index 0.967774
2 gdp_per_capita 0.018507
0 hospital_beds_per_thousand 0.008734
3 population_density 0.003941
4 population 0.001044
In [496]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable Path/DATA_DIR
# so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[496]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [497]:
# Country pair under comparison for this run of the pipeline.
country1 = 'United Kingdom'
country2 = 'Bulgaria'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): 'Mortality Rate' is kept at the end of this selection; the PCA step
# below slices iloc[:, 2:], which therefore includes the target -- see note there.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [498]:
df_updated
Out[498]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13606 United Kingdom 12/26/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13607 United Kingdom 12/27/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13608 United Kingdom 12/28/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13609 United Kingdom 12/29/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564

2090 rows × 10 columns

In [499]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country group so a country's first rows get NaN rather than
# another country's trailing values.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [500]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): imputing 0 asserts "no prior mortality" for each country's first
# 1/7/30 days -- verify this is the intended semantics rather than dropping those rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [501]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the column selection above places 'Mortality Rate' at position 9 and
# the lagged mortality columns after it, so iloc[:, 2:] includes the target and its
# lags in the PCA inputs -- target leakage into X; this likely inflates the scores
# below. Exclude target-derived columns before fitting.
# NOTE(review): PCA is fit on raw, unscaled columns; StandardScaler is only applied
# later to the component scores -- consider scaling before PCA (variance-driven).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[501]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [502]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first n_components component scores (columns ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [503]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (PC1..PC7), not the original
# features -- the names are misleading and propagate into the importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [504]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never referenced below (X comes from
# principal_df) -- presumably dead code; verify.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [505]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split on time-series rows with lagged-target features can
# inflate test scores -- consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [506]:
# Fit scaling on the training set
# Scaler statistics are learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[506]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [507]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [508]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [509]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): n_estimators=100 is superseded by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [510]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; 81 candidates x 10 folds = 810 fits.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.944267642996866
In [511]:
# fit random forest model with best hyperparameters from above
# NOTE(review): grid_search.best_estimator_ (refit=True by default) could be used
# directly instead of rebuilding the model by hand.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test split for the evaluation below.
y_pred = best_rf_model.predict(X_test_scaled)
In [512]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between
# distributions; mortality-rate vectors are not distributions -- not a meaningful
# regression metric; review.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  1.43474289655929
R2 Score: 0.9257898367511572
RMSE: 1.197808
Entropy Value: 0.01142862992057888
In [513]:
# NOTE(review): importances are over the mislabelled PCA component scores, not the
# raw features named in the table.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[513]:
feature importance
1 diabetes_prevalence 0.867049
6 median_age 0.035735
2 female_smokers 0.027527
3 male_smokers 0.020658
4 life_expectancy 0.018178
0 cardiovasc_death_rate 0.016531
5 aged_65_older 0.014323
In [514]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable Path/DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[514]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [515]:
# Country pair under comparison for this run of the pipeline.
country1 = 'United Kingdom'
country2 = 'Bulgaria'

# Extracting important features for the Random Forest Model Analysis for the country health index
# NOTE(review): 'Mortality Rate' is kept at the end of this selection; the PCA step
# below slices iloc[:, 2:], which therefore includes the target -- see note there.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [516]:
df_updated
Out[516]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 2.540 0.932 39753.244 272.898 67508936 0.883564
13606 United Kingdom 12/26/2022 2.540 0.932 39753.244 272.898 67508936 0.883564
13607 United Kingdom 12/27/2022 2.540 0.932 39753.244 272.898 67508936 0.883564
13608 United Kingdom 12/28/2022 2.540 0.932 39753.244 272.898 67508936 0.883564
13609 United Kingdom 12/29/2022 2.540 0.932 39753.244 272.898 67508936 0.883564

2090 rows × 8 columns

In [517]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country group so shifted values do not cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [518]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): imputing 0 asserts "no prior mortality" for each country's first
# 1/7/30 days -- verify this is the intended semantics.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [519]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the column selection above places 'Mortality Rate' at position 7 and
# the lagged mortality columns after it, so iloc[:, 2:] includes the target and its
# lags in the PCA inputs -- target leakage into X; exclude target-derived columns.
# NOTE(review): PCA is fit on raw, unscaled columns; StandardScaler is only applied
# later to the component scores -- consider scaling before PCA (variance-driven).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[519]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [520]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keep only the first n_components component scores (columns ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [521]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (PC1..PC5), not the original
# features -- the names are misleading and propagate into the importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [522]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never referenced below -- presumably dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [523]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split on time-series rows with lagged-target features can
# inflate test scores -- consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [524]:
# Fit scaling on the training set
# Scaler statistics are learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[524]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [525]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [526]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [527]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): n_estimators=100 is superseded by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [528]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; 81 candidates x 10 folds = 810 fits.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9339708697999024
In [529]:
# fit random forest model with best hyperparameters from above
# NOTE(review): grid_search.best_estimator_ (refit=True by default) could be used
# directly instead of rebuilding the model by hand.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test split for the evaluation below.
y_pred = best_rf_model.predict(X_test_scaled)
In [530]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between
# distributions; mortality-rate vectors are not distributions -- not a meaningful
# regression metric; review.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.7342554973835562
R2 Score: 0.9620216134487464
RMSE: 0.856887
Entropy Value: 0.007601507191777842
In [531]:
# NOTE(review): importances are over the mislabelled PCA component scores, not the
# raw features named in the table.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[531]:
feature importance
1 human_development_index 0.881448
2 gdp_per_capita 0.054324
4 population 0.037373
3 population_density 0.025757
0 hospital_beds_per_thousand 0.001098
In [532]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable Path/DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[532]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [533]:
# Country pair under comparison for this run of the pipeline.
country1 = 'Italy'
country2 = 'Romania'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): 'Mortality Rate' is kept at the end of this selection; the PCA step
# below slices iloc[:, 2:], which therefore includes the target -- see note there.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [534]:
df_updated
Out[534]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
17800 Romania 2/26/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17801 Romania 2/27/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17802 Romania 2/28/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17803 Romania 2/29/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17804 Romania 3/1/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2102 rows × 10 columns

In [535]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country group so shifted values do not cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [536]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): imputing 0 asserts "no prior mortality" for each country's first
# 1/7/30 days -- verify this is the intended semantics.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [537]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the column selection above places 'Mortality Rate' at position 9 and
# the lagged mortality columns after it, so iloc[:, 2:] includes the target and its
# lags in the PCA inputs -- target leakage into X; exclude target-derived columns.
# NOTE(review): PCA is fit on raw, unscaled columns; StandardScaler is only applied
# later to the component scores -- consider scaling before PCA (variance-driven).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[537]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [538]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keep only the first n_components component scores (columns ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [539]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (PC1..PC7), not the original
# features -- the names are misleading and propagate into the importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [540]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never referenced below -- presumably dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [541]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split on time-series rows with lagged-target features can
# inflate test scores -- consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [542]:
# Fit scaling on the training set
# Scaler statistics are learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[542]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [543]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [544]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [545]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): n_estimators=100 is superseded by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [546]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; 81 candidates x 10 folds = 810 fits.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9987132285641792
In [547]:
# fit random forest model with best hyperparameters from above
# NOTE(review): grid_search.best_estimator_ (refit=True by default) could be used
# directly instead of rebuilding the model by hand.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out test split for the evaluation below.
y_pred = best_rf_model.predict(X_test_scaled)
In [548]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between
# distributions; mortality-rate vectors are not distributions -- not a meaningful
# regression metric; review.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009270441753640613
R2 Score: 0.9991156527114821
RMSE: 0.096283
Entropy Value: 0.00027465585171992864
In [549]:
# NOTE(review): importances are over the mislabelled PCA component scores, not the
# raw features named in the table.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[549]:
feature importance
1 diabetes_prevalence 0.531054
0 cardiovasc_death_rate 0.434367
5 aged_65_older 0.018956
2 female_smokers 0.010941
3 male_smokers 0.002241
6 median_age 0.002159
4 life_expectancy 0.000282
In [550]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[550]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [551]:
# Country pair under analysis.
country1 = 'Italy'
country2 = 'Romania'

# Keep only the columns used by the country-health-index Random Forest
# analysis, then restrict the rows to the two countries of interest.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand',
                     'human_development_index', 'gdp_per_capita',
                     'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[health_index_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [552]:
# Inspect the filtered two-country frame (rendered via rich display).
df_updated
Out[552]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
17800 Romania 2/26/2020 6.892 0.828 23313.199 85.129 19659270 0.000000
17801 Romania 2/27/2020 6.892 0.828 23313.199 85.129 19659270 0.000000
17802 Romania 2/28/2020 6.892 0.828 23313.199 85.129 19659270 0.000000
17803 Romania 2/29/2020 6.892 0.828 23313.199 85.129 19659270 0.000000
17804 Romania 3/1/2020 6.892 0.828 23313.199 85.129 19659270 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.180 0.892 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.180 0.892 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.180 0.892 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.180 0.892 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.180 0.892 35220.084 205.859 59037472 0.735109

2102 rows × 8 columns

In [553]:
'''
Create lagged mortality-rate variables (previous day / week / month) with
pandas' shift() so the time-series dataset becomes a supervised-learning
table: each row is one observation and each column one feature, which is
the tabular format a Random Forest (a non-sequential ensemble learner)
needs in order to rank predictors of COVID-19 mortality per country.
'''
# Work on an explicit copy: df_updated is a slice of the imported frame,
# and assigning new columns to a slice triggers pandas'
# SettingWithCopyWarning (and under copy-on-write the writes would be lost).
df_updated = df_updated.copy()

# Lag within each country so one country's history never leaks into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [554]:
# The first day/week/month of each country has no history, so the lagged
# columns start with NaNs; treat those missing lags as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [555]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after location/date, which
# includes 'Mortality Rate' (the prediction target) and the lagged mortality
# columns — the target leaks into the components later used as model inputs.
# Confirm this is intended; fitting on the feature columns only avoids it.
# NOTE(review): PCA is fit on unscaled data here while StandardScaler is
# applied only after the PCA transform, so large-magnitude variables (e.g.
# population) dominate the components — verify the intended ordering.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[555]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [556]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Project the data and keep only the first 5 components (the directions
# of highest variance, in decreasing order).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [557]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns hold principal components, each a linear mix of
# all inputs; labelling them with original variable names is a positional
# convention only, not a one-to-one correspondence.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [558]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used
# downstream (only 'Mortality Rate' is read from df_updated afterwards).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [559]:
# Model inputs: the five retained principal components; target: mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()

# Hold out 30% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [560]:
# Fit scaling on the training set only, so test-set statistics never leak
# into the standardisation parameters (mean/std are learned from X_train).
scaler = StandardScaler()
scaler.fit(X_train)
Out[560]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [561]:
# Apply the training-fitted standardisation to the training data itself.
X_train_scaled = scaler.transform(X_train)
In [562]:
# Apply the same training-fitted standardisation to the test set.
X_test_scaled = scaler.transform(X_test)
In [563]:
# Base RandomForestRegressor; fixed seed so grid-search results are reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space: 3 x 3 x 3 x 3 = 81 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [564]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 candidates x 10 folds across all CPU
# cores; results are identical to the serial run.  Scoring defaults to
# the regressor's R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the corresponding mean CV R^2.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998323341861008
In [565]:
# GridSearchCV (refit=True by default) has already refit a model with the
# best hyperparameters on the whole training set, so reuse it instead of
# re-plumbing best_params_ into a second forest and retraining.
# best_estimator_ is a clone of `rf`, so it keeps random_state=42 and
# yields exactly the same predictions as the manual refit did.
best_rf_model = grid_search.best_estimator_

# Predict mortality rate on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [566]:
# Evaluate the Random Forest on the held-out test set:
# MSE, RMSE (same units as the target), R^2, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes the KL divergence between two
# probability distributions (it normalises both inputs to sum to 1).
# y_test / y_pred here are raw mortality rates containing zeros, not
# distributions, so this value is hard to interpret as a quality metric —
# confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011774878731579577
R2 Score: 0.9988767437026601
RMSE: 0.108512
Entropy Value: 0.0002876750864884934
In [567]:
# Rank model inputs by the fitted forest's impurity-based importances.
# NOTE(review): X was built from principal components, so each "feature"
# here is a PC mixing all original variables — the original-variable
# labels are only positional, not a direct attribution; confirm before
# interpreting the ranking as per-variable importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[567]:
feature importance
1 human_development_index 0.966681
2 gdp_per_capita 0.025068
0 hospital_beds_per_thousand 0.003943
3 population_density 0.003464
4 population 0.000845
In [568]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[568]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [569]:
# Country pair under analysis.
country1 = 'Spain'
country2 = 'United States'

# Keep only the columns used by the population-health-index Random Forest
# analysis, then restrict the rows to the two countries of interest.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                   'female_smokers', 'male_smokers', 'life_expectancy',
                   'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[pop_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [570]:
df_updated
Out[570]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
24074 Spain 2/1/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24075 Spain 2/2/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24076 Spain 2/3/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24077 Spain 2/4/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24078 Spain 2/5/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 10 columns

In [571]:
'''
Create lagged mortality-rate variables (previous day / week / month) with
pandas' shift() so the time-series dataset becomes a supervised-learning
table: each row is one observation and each column one feature, which is
the tabular format a Random Forest (a non-sequential ensemble learner)
needs in order to rank predictors of COVID-19 mortality per country.
'''
# Work on an explicit copy: df_updated is a slice of the imported frame,
# and assigning new columns to a slice triggers pandas'
# SettingWithCopyWarning (and under copy-on-write the writes would be lost).
df_updated = df_updated.copy()

# Lag within each country so one country's history never leaks into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [572]:
# The first day/week/month of each country has no history, so the lagged
# columns start with NaNs; treat those missing lags as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [573]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after location/date, which
# includes 'Mortality Rate' (the prediction target) and the lagged mortality
# columns — the target leaks into the components later used as model inputs.
# Confirm this is intended; fitting on the feature columns only avoids it.
# NOTE(review): PCA is fit on unscaled data here while StandardScaler is
# applied only after the PCA transform, so large-magnitude variables
# dominate the components — verify the intended ordering.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[573]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [574]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [575]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [576]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [577]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [578]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[578]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [579]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [580]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [581]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [582]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 candidates x 10 folds across all CPU
# cores; results are identical to the serial run.  Scoring defaults to
# the regressor's R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the corresponding mean CV R^2.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975684860546954
In [583]:
# GridSearchCV (refit=True by default) has already refit a model with the
# best hyperparameters on the whole training set, so reuse it instead of
# re-plumbing best_params_ into a second forest and retraining.
# best_estimator_ is a clone of `rf`, so it keeps random_state=42 and
# yields exactly the same predictions as the manual refit did.
best_rf_model = grid_search.best_estimator_

# Predict mortality rate on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [584]:
# Evaluate the Random Forest on the held-out test set:
# MSE, RMSE (same units as the target), R^2, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes the KL divergence between two
# probability distributions (it normalises both inputs to sum to 1).
# y_test / y_pred here are raw mortality rates containing zeros, not
# distributions, so this value is hard to interpret as a quality metric —
# confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.09278060807270389
R2 Score: 0.9843829874630111
RMSE: 0.304599
Entropy Value: 0.0033059645967402937
In [585]:
# Rank model inputs by the fitted forest's impurity-based importances.
# NOTE(review): X was built from principal components, so each "feature"
# here is a PC mixing all original variables — the original-variable
# labels are only positional, not a direct attribution; confirm before
# interpreting the ranking as per-variable importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[585]:
feature importance
1 diabetes_prevalence 0.910893
0 cardiovasc_death_rate 0.056002
2 female_smokers 0.019922
5 aged_65_older 0.005333
6 median_age 0.005246
3 male_smokers 0.002155
4 life_expectancy 0.000449
In [586]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[586]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [587]:
# Country pair under analysis.
country1 = 'Spain'
country2 = 'United States'

# Keep only the columns used by the country-health-index Random Forest
# analysis, then restrict the rows to the two countries of interest.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand',
                     'human_development_index', 'gdp_per_capita',
                     'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[health_index_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [588]:
df_updated
Out[588]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
24074 Spain 2/1/2020 2.97 0.904 34272.360 93.105 47558632 0.000000
24075 Spain 2/2/2020 2.97 0.904 34272.360 93.105 47558632 0.000000
24076 Spain 2/3/2020 2.97 0.904 34272.360 93.105 47558632 0.000000
24077 Spain 2/4/2020 2.97 0.904 34272.360 93.105 47558632 0.000000
24078 Spain 2/5/2020 2.97 0.904 34272.360 93.105 47558632 0.000000
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 54225.446 35.608 338289856 1.084791

2136 rows × 8 columns

In [589]:
'''
Create lagged mortality-rate variables (previous day / week / month) with
pandas' shift() so the time-series dataset becomes a supervised-learning
table: each row is one observation and each column one feature, which is
the tabular format a Random Forest (a non-sequential ensemble learner)
needs in order to rank predictors of COVID-19 mortality per country.
'''
# Work on an explicit copy: df_updated is a slice of the imported frame,
# and assigning new columns to a slice triggers pandas'
# SettingWithCopyWarning (and under copy-on-write the writes would be lost).
df_updated = df_updated.copy()

# Lag within each country so one country's history never leaks into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [590]:
# The first day/week/month of each country has no history, so the lagged
# columns start with NaNs; treat those missing lags as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [591]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after location/date, which
# includes 'Mortality Rate' (the prediction target) and the lagged mortality
# columns — the target leaks into the components later used as model inputs.
# Confirm this is intended; fitting on the feature columns only avoids it.
# NOTE(review): PCA is fit on unscaled data here while StandardScaler is
# applied only after the PCA transform, so large-magnitude variables
# dominate the components — verify the intended ordering.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[591]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [592]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [593]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [594]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [595]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [596]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[596]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [597]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [598]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [599]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [600]:
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 candidates x 10 folds across all CPU
# cores; results are identical to the serial run.  Scoring defaults to
# the regressor's R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the corresponding mean CV R^2.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974620993069001
In [601]:
# GridSearchCV (refit=True by default) has already refit a model with the
# best hyperparameters on the whole training set, so reuse it instead of
# re-plumbing best_params_ into a second forest and retraining.
# best_estimator_ is a clone of `rf`, so it keeps random_state=42 and
# yields exactly the same predictions as the manual refit did.
best_rf_model = grid_search.best_estimator_

# Predict mortality rate on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [602]:
# Evaluate the Random Forest on the held-out test set:
# MSE, RMSE (same units as the target), R^2, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes the KL divergence between two
# probability distributions (it normalises both inputs to sum to 1).
# y_test / y_pred here are raw mortality rates containing zeros, not
# distributions, so this value is hard to interpret as a quality metric —
# confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.06762065995368763
R2 Score: 0.988617958901189
RMSE: 0.260040
Entropy Value: 0.0024049365914767576
In [603]:
# Rank model inputs by the fitted forest's impurity-based importances.
# NOTE(review): X was built from principal components, so each "feature"
# here is a PC mixing all original variables — the original-variable
# labels are only positional, not a direct attribution; confirm before
# interpreting the ranking as per-variable importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[603]:
feature importance
1 human_development_index 0.959159
2 gdp_per_capita 0.027860
0 hospital_beds_per_thousand 0.008511
3 population_density 0.003790
4 population 0.000679
In [7]:
# Country Pair by Pair Analysis relative to gdp_per_capita
In [8]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[8]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [9]:
# Showing the pairings of countries based on gdp_per_capita (13 pairs of countries)
# Split the master frame into one dataframe per country; consecutive pairs
# below are the gdp_per_capita-based pairings (13 pairs of countries).
df_Ireland = df.loc[df["location"] == "Ireland"]
df_Luxembourg = df.loc[df["location"] == "Luxembourg"]

df_Switzerland = df.loc[df["location"] == "Switzerland"]
df_UnitedStates = df.loc[df["location"] == "United States"]

df_Austria = df.loc[df["location"] == "Austria"]
df_Belgium = df.loc[df["location"] == "Belgium"]

df_Canada = df.loc[df["location"] == "Canada"]
df_Denmark = df.loc[df["location"] == "Denmark"]

df_Finland = df.loc[df["location"] == "Finland"]
df_France = df.loc[df["location"] == "France"]

df_Iceland = df.loc[df["location"] == "Iceland"]
df_Italy = df.loc[df["location"] == "Italy"]

df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Sweden = df.loc[df["location"] == "Sweden"]

df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]

df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Czechia = df.loc[df["location"] == "Czechia"]

df_Estonia = df.loc[df["location"] == "Estonia"]
df_Latvia = df.loc[df["location"] == "Latvia"]

df_Portugal = df.loc[df["location"] == "Portugal"]
df_Romania = df.loc[df["location"] == "Romania"]

df_Serbia = df.loc[df["location"] == "Serbia"]
df_Slovakia = df.loc[df["location"] == "Slovakia"]

df_Slovenia = df.loc[df["location"] == "Slovenia"]
df_Spain = df.loc[df["location"] == "Spain"]
In [10]:
# tail(-2) drops the first two rows of the UK frame.
# NOTE(review): the reason for removing exactly two rows is not visible
# here — document why (e.g. extra leading dates) or verify it is intended.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [11]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# NOTE(review): to_csv without index=False writes the row index as an
# unnamed first column; also the file is written to the working directory
# but read back later from an absolute Downloads path — verify both.
dataframe_one.to_csv("dataframe-one.csv")
In [12]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# or a path relative to the notebook so the analysis is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Last expression -> rich display of the loaded frame.
df_updated
Out[12]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [13]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Keep identifiers, the population-health features used by the Random Forest
# analysis, and the target, restricted to the current country pair.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
               'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [14]:
df_updated
Out[14]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388

2076 rows × 10 columns

In [15]:
"""
Create lagged mortality variables (previous day / week / month) with shift() to
convert the OWID COVID-19 time series into a supervised-learning table: a
Random Forest is a non-sequential ensemble model, so past mortality must be
supplied as explicit feature columns rather than implied by row order. This
lets the model be used to assess which variables best predict COVID-19
mortality per country.
"""
# df_updated is a filtered slice of the imported frame; copy it before adding
# columns so pandas does not emit SettingWithCopyWarning or write to a view.
df_updated = df_updated.copy()

# Lag the target within each country so one country's history never leaks
# into another country's lagged values.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30)
In [16]:
# The earliest rows of each country's series have no history to shift from;
# treat those missing lag values as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [17]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the three lagged
# mortality columns, so the prediction target is folded into the PCA features
# (target leakage) — confirm this is intended.
# NOTE(review): PCA is fit on unscaled data and on the full dataset before the
# train/test split; standardizing first and fitting on the training split only
# would avoid scale dominance and evaluation leakage.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[17]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [18]:
# Retain the first 7 principal components — one per population-health input
# variable used by the Random Forest analysis.
n_components = 7
transformed = pca.transform(df_updated.iloc[:, 2:])
principal_components = transformed[:, :n_components]
In [19]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original
# variables — reusing the original feature names here makes the later
# feature-importance table easy to misread; PC1..PC7 labels would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Re-attach the country label (row order is preserved by pca.transform).
principal_df['location'] = df_updated['location'].values
In [20]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used as model
# inputs below (X is built from principal_df), so this step may be removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [21]:
# X: the 7 retained principal components; y: the unscaled mortality-rate target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of daily time-series rows places
# near-identical neighbouring days in both train and test sets, which likely
# inflates the reported scores; a chronological split (shuffle=False) would be
# a fairer evaluation — confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [22]:
# Learn standardization statistics (mean/std) from the training set only, so
# no test-set information leaks into the scaling step.
scaler = StandardScaler().fit(X_train)
scaler
Out[22]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [23]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses the train-fitted mean/std
In [24]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # same train-fitted statistics; no refit on test data
In [25]:
# Base Random Forest regressor; fixed random_state for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by GridSearchCV below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [26]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 10-fold CV over shuffled daily time-series rows lets each fold
# see near-duplicate neighbouring days from the other folds, so the CV score
# is likely optimistic — confirm acceptable for this analysis.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9981832621613457
In [27]:
# Refit a forest using the best hyperparameter combination found by the grid
# search above (best_params_ holds exactly the four tuned parameters).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [28]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality-rate target
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and zero entries can make it degenerate; confirm
# this measure is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002099363526029487
R2 Score: 0.9990806391844732
RMSE: 0.045819
Entropy Value: 0.0003826774935355706
In [29]:
# Impurity-based importances from the fitted forest.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances describe PCs 1..7, not the original variables whose names are
# reused in selected_cols — interpret the labels with care.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Display ranked importances.
feature_importances
Out[29]:
feature importance
5 aged_65_older 0.508703
0 cardiovasc_death_rate 0.420926
2 female_smokers 0.028547
6 median_age 0.024134
1 diabetes_prevalence 0.015762
3 male_smokers 0.001625
4 life_expectancy 0.000304
In [30]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# or a path relative to the notebook so the analysis is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Last expression -> rich display of the loaded frame.
df_updated
Out[30]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [31]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Keep identifiers, the country-health-index features used by the Random
# Forest analysis, and the target, restricted to the current country pair.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
In [32]:
df_updated
Out[32]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.916 0.2 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.916 0.2 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.916 0.2 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.916 0.2 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.916 0.2 231.447 647601 0.000000
... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.955 0.2 69.874 5023108 0.491388
19869 Ireland 12/26/2022 2.96 0.955 0.2 69.874 5023108 0.491388
19870 Ireland 12/27/2022 2.96 0.955 0.2 69.874 5023108 0.491388
19871 Ireland 12/28/2022 2.96 0.955 0.2 69.874 5023108 0.491388
19872 Ireland 12/29/2022 2.96 0.955 0.2 69.874 5023108 0.491388

2076 rows × 8 columns

In [33]:
"""
Create lagged mortality variables (previous day / week / month) with shift() to
convert the OWID COVID-19 time series into a supervised-learning table: a
Random Forest is a non-sequential ensemble model, so past mortality must be
supplied as explicit feature columns rather than implied by row order. This
lets the model be used to assess which variables best predict COVID-19
mortality per country.
"""
# df_updated is a filtered slice of the imported frame; copy it before adding
# columns so pandas does not emit SettingWithCopyWarning or write to a view.
df_updated = df_updated.copy()

# Lag the target within each country so one country's history never leaks
# into another country's lagged values.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30)
In [34]:
# The earliest rows of each country's series have no history to shift from;
# treat those missing lag values as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [35]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the three lagged
# mortality columns, so the prediction target is folded into the PCA features
# (target leakage) — confirm this is intended.
# NOTE(review): PCA is fit on unscaled data and on the full dataset before the
# train/test split; standardizing first and fitting on the training split only
# would avoid scale dominance and evaluation leakage.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[35]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [36]:
# Retain the first 5 principal components — one per country-health-index input
# variable used by the Random Forest analysis.
n_components = 5
transformed = pca.transform(df_updated.iloc[:, 2:])
principal_components = transformed[:, :n_components]
In [37]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original
# variables — reusing the original feature names here makes the later
# feature-importance table easy to misread; PC1..PC5 labels would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
# Re-attach the country label (row order is preserved by pca.transform).
principal_df['location'] = df_updated['location'].values
In [38]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used as model
# inputs below (X is built from principal_df), so this step may be removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [39]:
# X: the 5 retained principal components; y: the unscaled mortality-rate target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of daily time-series rows places
# near-identical neighbouring days in both train and test sets, which likely
# inflates the reported scores; a chronological split (shuffle=False) would be
# a fairer evaluation — confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [40]:
# Learn standardization statistics (mean/std) from the training set only, so
# no test-set information leaks into the scaling step.
scaler = StandardScaler().fit(X_train)
scaler
Out[40]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [41]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses the train-fitted mean/std
In [42]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # same train-fitted statistics; no refit on test data
In [43]:
# Base Random Forest regressor; fixed random_state for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by GridSearchCV below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [44]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 10-fold CV over shuffled daily time-series rows lets each fold
# see near-duplicate neighbouring days from the other folds, so the CV score
# is likely optimistic — confirm acceptable for this analysis.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9981230977543898
In [45]:
# Refit a forest using the best hyperparameter combination found by the grid
# search above (best_params_ holds exactly the four tuned parameters).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [46]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality-rate target
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and zero entries can make it degenerate; confirm
# this measure is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0038624036497071725
R2 Score: 0.9983085623212649
RMSE: 0.062148
Entropy Value: 0.0008228738012960621
In [47]:
# Impurity-based importances from the fitted forest.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances describe PCs 1..5, not the original variables whose names are
# reused in selected_cols — interpret the labels with care.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Display ranked importances.
feature_importances
Out[47]:
feature importance
1 human_development_index 0.920449
2 extreme_poverty 0.043623
0 hospital_beds_per_thousand 0.027483
3 population_density 0.007933
4 population 0.000512
In [48]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# or a path relative to the notebook so the analysis is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Last expression -> rich display of the loaded frame.
df_updated
Out[48]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [49]:
country1 = 'Switzerland'
country2 = 'United States'

# Keep identifiers, the population-health features used by the Random Forest
# analysis, and the target, restricted to the current country pair.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
               'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [50]:
df_updated
Out[50]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2112 rows × 10 columns

In [51]:
"""
Create lagged mortality variables (previous day / week / month) with shift() to
convert the OWID COVID-19 time series into a supervised-learning table: a
Random Forest is a non-sequential ensemble model, so past mortality must be
supplied as explicit feature columns rather than implied by row order. This
lets the model be used to assess which variables best predict COVID-19
mortality per country.
"""
# df_updated is a filtered slice of the imported frame; copy it before adding
# columns so pandas does not emit SettingWithCopyWarning or write to a view.
df_updated = df_updated.copy()

# Lag the target within each country so one country's history never leaks
# into another country's lagged values.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30)
In [52]:
# The earliest rows of each country's series have no history to shift from;
# treat those missing lag values as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [53]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the three lagged
# mortality columns, so the prediction target is folded into the PCA features
# (target leakage) — confirm this is intended.
# NOTE(review): PCA is fit on unscaled data and on the full dataset before the
# train/test split; standardizing first and fitting on the training split only
# would avoid scale dominance and evaluation leakage.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[53]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [54]:
# Retain the first 7 principal components — one per population-health input
# variable used by the Random Forest analysis.
n_components = 7
transformed = pca.transform(df_updated.iloc[:, 2:])
principal_components = transformed[:, :n_components]
In [55]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original
# variables — reusing the original feature names here makes the later
# feature-importance table easy to misread; PC1..PC7 labels would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Re-attach the country label (row order is preserved by pca.transform).
principal_df['location'] = df_updated['location'].values
In [56]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used as model
# inputs below (X is built from principal_df), so this step may be removable.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [57]:
# X: the 7 retained principal components; y: the unscaled mortality-rate target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of daily time-series rows places
# near-identical neighbouring days in both train and test sets, which likely
# inflates the reported scores; a chronological split (shuffle=False) would be
# a fairer evaluation — confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [58]:
# Learn standardization statistics (mean/std) from the training set only, so
# no test-set information leaks into the scaling step.
scaler = StandardScaler().fit(X_train)
scaler
Out[58]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [59]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses the train-fitted mean/std
In [60]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # same train-fitted statistics; no refit on test data
In [61]:
# Base Random Forest regressor; fixed random_state for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space explored by GridSearchCV below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [62]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 10-fold CV over shuffled daily time-series rows lets each fold
# see near-duplicate neighbouring days from the other folds, so the CV score
# is likely optimistic — confirm acceptable for this analysis.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.982328888461846
In [63]:
# Refit a forest using the best hyperparameter combination found by the grid
# search above (best_params_ holds exactly the four tuned parameters).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict mortality on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [64]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality-rate target
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and zero entries can make it degenerate; confirm
# this measure is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.014574168625324405
R2 Score: 0.9932005000328458
RMSE: 0.120724
Entropy Value: 0.0009401813968172323
In [65]:
# Impurity-based importances from the fitted forest.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances describe PCs 1..7, not the original variables whose names are
# reused in selected_cols — interpret the labels with care.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Display ranked importances.
feature_importances
Out[65]:
feature importance
1 diabetes_prevalence 0.934124
2 female_smokers 0.035472
6 median_age 0.010925
3 male_smokers 0.006711
5 aged_65_older 0.004865
0 cardiovasc_death_rate 0.004014
4 life_expectancy 0.003889
In [66]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# or a path relative to the notebook so the analysis is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Last expression -> rich display of the loaded frame.
df_updated
Out[66]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [67]:
country1 = 'Switzerland'
country2 = 'United States'

# Keep identifiers, the country-health-index features used by the Random
# Forest analysis, and the target, restricted to the current country pair.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
In [68]:
df_updated
Out[68]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 0.03 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 0.03 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 0.03 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 0.03 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 0.03 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.20 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.20 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.20 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.20 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.20 35.608 338289856 1.084791

2112 rows × 8 columns

In [69]:
"""
Create lagged mortality variables (previous day / week / month) with shift() to
convert the OWID COVID-19 time series into a supervised-learning table: a
Random Forest is a non-sequential ensemble model, so past mortality must be
supplied as explicit feature columns rather than implied by row order. This
lets the model be used to assess which variables best predict COVID-19
mortality per country.
"""
# df_updated is a filtered slice of the imported frame; copy it before adding
# columns so pandas does not emit SettingWithCopyWarning or write to a view.
df_updated = df_updated.copy()

# Lag the target within each country so one country's history never leaks
# into another country's lagged values.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30)
In [70]:
# The first day/week/month of each country's series has no history to lag
# from, so shift() left NaNs there; treat that missing history as zero.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [71]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date',
# which includes 'Mortality Rate' itself and its three lag columns — the
# prediction target leaks into the components.  The fit should probably be
# restricted to the predictor columns only; confirm intent.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns
# (e.g. 'population') dominate the variance; standardizing before fitting
# would make the components meaningful.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[71]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [72]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# Keep only the first 5 components (ordered by explained variance).
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [73]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a
# principal component (a linear mixture of ALL numeric columns, lags and
# target included), not the original feature it is named after.  The
# feature importances computed later therefore describe components, not
# the raw variables; consider naming them PC1..PC5 or skipping PCA here.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [74]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is taken from principal_df), so this encoding step appears to be dead
# code in this pipeline.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [75]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# X holds the principal components (mislabelled with raw feature names);
# y is aligned with X only by row position, which holds because
# principal_df was built from the same rows of df_updated in the same order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a time series mixes future and past rows
# between train and test; combined with lagged-target features this tends
# to inflate scores — a chronological split would be the safer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [76]:
# Fit scaling on the training set
# Fitting the scaler on the training split only (and merely transforming
# the test split later) avoids leaking test-set statistics into
# preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
Out[76]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [77]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses train-set mean/std only
In [78]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # same train-fitted scaler; no refit
In [79]:
# Base RandomForestRegressor; every grid entry below overrides these
# defaults during the search, so only random_state matters here.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [80]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the folds are drawn from randomly-split time-series rows,
# so temporally adjacent (highly similar) observations land in both the
# training and validation folds — the CV score is likely optimistic.
# TimeSeriesSplit would give an honest estimate; confirm this is intended.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best CV score: 0.9792894494169208
In [81]:
# fit random forest model with best hyperparameters from above
# Pass the winning configuration straight from the search instead of
# retyping each hyperparameter by hand — this keeps the cell in sync with
# param_grid if the grid ever changes, and random_state=42 keeps the refit
# reproducible (identical to the manually re-specified model).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test split for evaluation in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [82]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence after
# normalizing both inputs into probability distributions — applying it to
# raw regression targets/predictions is not a standard error metric, and
# zeros in y_test/y_pred make the value hard to interpret; confirm this is
# the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.018987374157218375
R2 Score: 0.9911415427337641
RMSE: 0.137795
Entropy Value: 0.0011604753693985666
In [83]:
# Rank the model's inputs (principal components labelled with feature
# names) by impurity-based importance, most important first, and display.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[83]:
feature importance
1 human_development_index 0.942162
2 extreme_poverty 0.041692
3 population_density 0.009401
4 population 0.004204
0 hospital_beds_per_thousand 0.002541
In [84]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[84]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [85]:
country1 = 'Austria'
country2 = 'Belgium'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): cells In[85]-In[101] repeat the In[67]-In[83] pipeline
# verbatim with a different country pair and column set — factor the whole
# pipeline into a function taking (df, country1, country2, feature_cols)
# and call it per pairing instead of copy-pasting.  The same caveats apply
# to this copy: the PCA below is fit on columns that include the target
# and its lags, its output columns are mislabelled with raw feature names,
# and the random train/test split mixes time periods.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [86]:
df_updated
Out[86]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2095 Belgium 12/26/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2096 Belgium 12/27/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2097 Belgium 12/28/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2098 Belgium 12/29/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787

2099 rows × 10 columns

In [87]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [88]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [89]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[89]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [90]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [91]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [92]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [93]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [94]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[94]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [95]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [96]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [97]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [98]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985702277201526
In [99]:
# fit random forest model with best hyperparameters from above
# Pass the winning configuration straight from the search instead of
# retyping each hyperparameter by hand — this keeps the cell in sync with
# param_grid if the grid ever changes, and random_state=42 keeps the refit
# reproducible (identical to the manually re-specified model).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test split for evaluation in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [100]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.020452761128882797
R2 Score: 0.9982725374621599
RMSE: 0.143013
Entropy Value: 0.0008156180119545589
In [101]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[101]:
feature importance
6 median_age 0.853422
1 diabetes_prevalence 0.089967
0 cardiovasc_death_rate 0.036597
5 aged_65_older 0.011508
3 male_smokers 0.004570
2 female_smokers 0.003805
4 life_expectancy 0.000130
In [102]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[102]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [103]:
country1 = 'Austria'
country2 = 'Belgium'

# Extracting important features for the Random Forest Model Analysis for the country health index
# NOTE(review): third verbatim copy of the same pipeline (cf. In[67]-In[83])
# — a single parameterized function called once per (country pair, column
# set) would replace all of these repeated cells.  The same caveats apply
# here: the PCA below is fit on columns including the target and its lags,
# its output columns are mislabelled with raw feature names, and the random
# train/test split mixes time periods.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [104]:
df_updated
Out[104]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 5.64 0.931 0.2 375.564 11655923 0.711787
2095 Belgium 12/26/2022 5.64 0.931 0.2 375.564 11655923 0.711787
2096 Belgium 12/27/2022 5.64 0.931 0.2 375.564 11655923 0.711787
2097 Belgium 12/28/2022 5.64 0.931 0.2 375.564 11655923 0.711787
2098 Belgium 12/29/2022 5.64 0.931 0.2 375.564 11655923 0.711787

2099 rows × 8 columns

In [105]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [106]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [107]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[107]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [108]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [109]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [110]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [111]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [112]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[112]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [113]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [114]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [115]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [116]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974451044142482
In [117]:
# fit random forest model with best hyperparameters from above
# Pass the winning configuration straight from the search instead of
# retyping each hyperparameter by hand — this keeps the cell in sync with
# param_grid if the grid ever changes, and random_state=42 keeps the refit
# reproducible (identical to the manually re-specified model).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test split for evaluation in the next cell
y_pred = best_rf_model.predict(X_test_scaled)
In [118]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010069518672884694
R2 Score: 0.9991495174577224
RMSE: 0.100347
Entropy Value: 0.00034019584039697294
In [119]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[119]:
feature importance
1 human_development_index 0.935823
2 extreme_poverty 0.050241
0 hospital_beds_per_thousand 0.009467
3 population_density 0.003676
4 population 0.000793
In [120]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[120]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [121]:
country1 = 'Canada'
country2 = 'Denmark'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): fourth verbatim copy of the same pipeline (cf. In[67]-In[83])
# — a single parameterized function called once per (country pair, column
# set) would replace all of these repeated cells.  The same caveats apply
# here: the PCA below is fit on columns including the target and its lags,
# its output columns are mislabelled with raw feature names, and the random
# train/test split mixes time periods.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [122]:
df_updated
Out[122]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5188 Denmark 2/3/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5189 Denmark 2/4/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5190 Denmark 2/5/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5191 Denmark 2/6/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2134 rows × 10 columns

In [123]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [124]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [125]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[125]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [126]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [127]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [128]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [129]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [130]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[130]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [131]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [132]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [133]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [134]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9984540951428933
In [135]:
# GridSearchCV with refit=True (the default) has already refitted a model with
# the best hyperparameters on the whole training set — reuse it instead of
# rebuilding the RandomForestRegressor by hand (identical model: same params,
# same random_state=42 carried over from the base estimator).
best_rf_model = grid_search.best_estimator_

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [136]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between inputs
# *normalized to probability distributions* — mortality rates are not
# distributions and y_test contains zeros, so this value is hard to interpret.
# Also, the near-perfect R^2 is consistent with target leakage: the PCA inputs
# were computed from columns that include 'Mortality Rate' and its lags.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002748350813295417
R2 Score: 0.9993440760119627
RMSE: 0.052425
Entropy Value: 0.00028602597187348704
In [137]:
# NOTE(review): these are Gini importances of *principal components* that were
# only relabeled with raw feature names — this table does not rank the
# original health variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[137]:
feature importance
1 diabetes_prevalence 0.742827
0 cardiovasc_death_rate 0.187586
6 median_age 0.034101
2 female_smokers 0.019551
5 aged_65_older 0.014214
3 male_smokers 0.001588
4 life_expectancy 0.000133
In [138]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a relative path or a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[138]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [139]:
country1 = 'Canada'
country2 = 'Denmark'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# .copy() detaches the filtered frame from the original so the later
# lagged-column assignments cannot raise SettingWithCopyWarning or silently
# write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [140]:
df_updated
Out[140]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
5187 Denmark 2/2/2020 2.5 0.940 0.2 136.520 5882259 0.000000
5188 Denmark 2/3/2020 2.5 0.940 0.2 136.520 5882259 0.000000
5189 Denmark 2/4/2020 2.5 0.940 0.2 136.520 5882259 0.000000
5190 Denmark 2/5/2020 2.5 0.940 0.2 136.520 5882259 0.000000
5191 Denmark 2/6/2020 2.5 0.940 0.2 136.520 5882259 0.000000
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 0.5 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 0.5 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 0.5 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 0.5 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 0.5 4.037 38454328 1.093162

2134 rows × 8 columns

In [141]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1 day, 7 days, 30 days back), shifted
# within each country so one country's history never leaks into the other's.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [142]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no history, hence the NaNs)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [143]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): `iloc[:, 2:]` includes 'Mortality Rate' and the lagged
# mortality columns, so the components used later as model inputs are linear
# combinations that contain the target itself (target leakage — this likely
# explains the near-perfect R^2). PCA is also fitted on the full dataset
# before the train/test split and on unscaled data. TODO: refit on scaled,
# target-free training data only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[143]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [144]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): matching k to the raw input count is arbitrary — choose k from
# pca.explained_variance_ratio_ instead. The transform input again includes
# 'Mortality Rate' and its lags (see target-leakage note on the fit cell).
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [145]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PC1..PC5 scores relabeled with the raw
# feature names — the labels are misleading downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
# Row order is preserved by PCA.transform, so a positional copy of 'location' stays aligned
principal_df['location'] = df_updated['location'].values
In [146]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and y from 'Mortality Rate', so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [147]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# NOTE(review): X holds PC scores relabeled with raw feature names; rows of
# principal_df and df_updated align only because both keep the original order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles time-ordered rows; a chronological
# split would be safer for time-series style evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [148]:
# Fit scaling on the training set
# NOTE(review): the scaler is fitted on PCA scores; the usual order is to
# standardize the raw features *before* fitting PCA — confirm intent.
scaler = StandardScaler()
scaler.fit(X_train)
Out[148]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [149]:
# Apply scaling on the training set (uses the mean/std learned from X_train)
X_train_scaled = scaler.transform(X_train)
In [150]:
# Apply scaling on the test set with the *training-set* statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [151]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations for the search)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [152]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 x 10 fits across all cores; the results are
# unchanged because each candidate still uses the estimator's fixed random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985212702477563
In [153]:
# GridSearchCV with refit=True (the default) has already refitted a model with
# the best hyperparameters on the whole training set — reuse it instead of
# rebuilding the RandomForestRegressor by hand (identical model: same params,
# same random_state=42 carried over from the base estimator).
best_rf_model = grid_search.best_estimator_

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [154]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between inputs
# *normalized to probability distributions* — mortality rates are not
# distributions and y_test contains zeros, so this value is hard to interpret.
# Also, the near-perfect R^2 is consistent with target leakage: the PCA inputs
# were computed from columns that include 'Mortality Rate' and its lags.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004149548610657574
R2 Score: 0.9990096648287803
RMSE: 0.064417
Entropy Value: 0.0005668080870825645
In [155]:
# Rank the model inputs by Gini importance. Note: the inputs are principal-
# component scores that were relabeled with raw feature names, so read the
# 'feature' labels with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[155]:
feature importance
1 human_development_index 0.937978
0 hospital_beds_per_thousand 0.035832
2 extreme_poverty 0.021979
3 population_density 0.003812
4 population 0.000400
In [156]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a relative path or a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[156]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [157]:
country1 = 'Finland'
country2 = 'France'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the filtered frame from the original so the later
# lagged-column assignments cannot raise SettingWithCopyWarning or silently
# write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [158]:
df_updated
Out[158]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7311 Finland 1/30/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7312 Finland 1/31/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7313 Finland 2/1/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7314 Finland 2/2/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
... ... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411892

2137 rows × 10 columns

In [159]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is grouped by country so one country's history never leaks into the other's lags
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [160]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 rows per country have no history; imputing 0
# fabricates an artificial mortality drop — dropping those rows may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [161]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): `iloc[:, 2:]` includes 'Mortality Rate' and the lagged
# mortality columns, so the components used later as model inputs are linear
# combinations that contain the target itself (target leakage — this likely
# explains the near-perfect R^2). PCA is also fitted on the full dataset
# before the train/test split and on unscaled data. TODO: refit on scaled,
# target-free training data only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[161]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [162]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# NOTE(review): matching k to the raw input count is arbitrary — choose k from
# pca.explained_variance_ratio_ instead. The transform input again includes
# 'Mortality Rate' and its lags (see target-leakage note on the fit cell).
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [163]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PC1..PC7 scores relabeled with the raw
# feature names — the labels are misleading downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Row order is preserved by PCA.transform, so a positional copy of 'location' stays aligned
principal_df['location'] = df_updated['location'].values
In [164]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and y from 'Mortality Rate', so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [165]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# NOTE(review): X holds PC scores relabeled with raw feature names; rows of
# principal_df and df_updated align only because both keep the original order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles time-ordered rows; a chronological
# split would be safer for time-series style evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [166]:
# Fit scaling on the training set
# NOTE(review): the scaler is fitted on PCA scores; the usual order is to
# standardize the raw features *before* fitting PCA — confirm intent.
scaler = StandardScaler()
scaler.fit(X_train)
Out[166]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [167]:
# Apply scaling on the training set (uses the mean/std learned from X_train)
X_train_scaled = scaler.transform(X_train)
In [168]:
# Apply scaling on the test set with the *training-set* statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [169]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations for the search)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [170]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 x 10 fits across all cores; the results are
# unchanged because each candidate still uses the estimator's fixed random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9901279701885508
In [171]:
# GridSearchCV with refit=True (the default) has already refitted a model with
# the best hyperparameters on the whole training set — reuse it instead of
# rebuilding the RandomForestRegressor by hand (identical model: same params,
# same random_state=42 carried over from the base estimator).
best_rf_model = grid_search.best_estimator_

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [172]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between inputs
# *normalized to probability distributions* — mortality rates are not
# distributions and y_test contains zeros, so this value is hard to interpret.
# Also, the near-perfect R^2 is consistent with target leakage: the PCA inputs
# were computed from columns that include 'Mortality Rate' and its lags.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.03485209804982485
R2 Score: 0.9965523687852408
RMSE: 0.186687
Entropy Value: 0.0021807550391466923
In [173]:
# NOTE(review): these are Gini importances of *principal components* that were
# only relabeled with raw feature names — this table does not rank the
# original health variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[173]:
feature importance
1 diabetes_prevalence 0.519750
0 cardiovasc_death_rate 0.429350
2 female_smokers 0.034202
5 aged_65_older 0.006474
3 male_smokers 0.004594
6 median_age 0.004512
4 life_expectancy 0.001119
In [174]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a relative path or a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[174]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [175]:
country1 = 'Finland'
country2 = 'France'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# .copy() detaches the filtered frame from the original so the later
# lagged-column assignments cannot raise SettingWithCopyWarning or silently
# write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [176]:
df_updated
Out[176]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
7310 Finland 1/29/2020 3.28 0.938 0.04 18.136 5540745 0.000000
7311 Finland 1/30/2020 3.28 0.938 0.04 18.136 5540745 0.000000
7312 Finland 1/31/2020 3.28 0.938 0.04 18.136 5540745 0.000000
7313 Finland 2/1/2020 3.28 0.938 0.04 18.136 5540745 0.000000
7314 Finland 2/2/2020 3.28 0.938 0.04 18.136 5540745 0.000000
... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 5.98 0.901 0.02 122.578 67813000 0.411710
9443 France 12/26/2022 5.98 0.901 0.02 122.578 67813000 0.411282
9444 France 12/27/2022 5.98 0.901 0.02 122.578 67813000 0.411730
9445 France 12/28/2022 5.98 0.901 0.02 122.578 67813000 0.411813
9446 France 12/29/2022 5.98 0.901 0.02 122.578 67813000 0.411892

2137 rows × 8 columns

In [177]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is grouped by country so one country's history never leaks into the other's lags
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [178]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 rows per country have no history; imputing 0
# fabricates an artificial mortality drop — dropping those rows may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [179]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): `iloc[:, 2:]` includes 'Mortality Rate' and the lagged
# mortality columns, so the components used later as model inputs are linear
# combinations that contain the target itself (target leakage — this likely
# explains the near-perfect R^2). PCA is also fitted on the full dataset
# before the train/test split and on unscaled data. TODO: refit on scaled,
# target-free training data only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[179]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [180]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): matching k to the raw input count is arbitrary — choose k from
# pca.explained_variance_ratio_ instead. The transform input again includes
# 'Mortality Rate' and its lags (see target-leakage note on the fit cell).
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [181]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PC1..PC5 scores relabeled with the raw
# feature names — the labels are misleading downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
# Row order is preserved by PCA.transform, so a positional copy of 'location' stays aligned
principal_df['location'] = df_updated['location'].values
In [182]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and y from 'Mortality Rate', so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [183]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# NOTE(review): X holds PC scores relabeled with raw feature names; rows of
# principal_df and df_updated align only because both keep the original order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles time-ordered rows; a chronological
# split would be safer for time-series style evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [184]:
# Fit scaling on the training set
# NOTE(review): the scaler is fitted on PCA scores; the usual order is to
# standardize the raw features *before* fitting PCA — confirm intent.
scaler = StandardScaler()
scaler.fit(X_train)
Out[184]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [185]:
# Apply scaling on the training set (uses the mean/std learned from X_train)
X_train_scaled = scaler.transform(X_train)
In [186]:
# Apply scaling on the test set with the *training-set* statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [187]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid (3*3*3*3 = 81 candidate combinations for the search)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [188]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 81 x 10 fits across all cores; the results are
# unchanged because each candidate still uses the estimator's fixed random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9910318902554458
In [189]:
# GridSearchCV with refit=True (the default) has already refitted a model with
# the best hyperparameters on the whole training set — reuse it instead of
# rebuilding the RandomForestRegressor by hand (identical model: same params,
# same random_state=42 carried over from the base estimator).
best_rf_model = grid_search.best_estimator_

# Predict on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [190]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between inputs
# *normalized to probability distributions* — mortality rates are not
# distributions and y_test contains zeros, so this value is hard to interpret.
# Also, the near-perfect R^2 is consistent with target leakage: the PCA inputs
# were computed from columns that include 'Mortality Rate' and its lags.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.035576227278599215
R2 Score: 0.9964807366404823
RMSE: 0.188617
Entropy Value: 0.0016675150996863651
In [191]:
# NOTE(review): these are Gini importances of *principal components* that were
# only relabeled with raw feature names — this table does not rank the
# original health variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[191]:
feature importance
1 human_development_index 0.937639
2 extreme_poverty 0.039984
0 hospital_beds_per_thousand 0.012847
3 population_density 0.007729
4 population 0.001800
In [192]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a relative path or a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[192]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [193]:
country1 = 'Iceland'
country2 = 'Italy'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the filtered frame from the original so the later
# lagged-column assignments cannot raise SettingWithCopyWarning or silently
# write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [194]:
df_updated
Out[194]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
20911 Iceland 2/28/2020 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.000000
20912 Iceland 2/29/2020 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.000000
20913 Iceland 3/1/2020 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.000000
20914 Iceland 3/2/2020 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.000000
20915 Iceland 3/3/2020 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2100 rows × 10 columns

In [195]:
# Rationale: a Random Forest is an ensemble method for non-sequential, tabular data, so the
# OWID COVID-19 time series must first be reframed as a supervised-learning problem. Lagged
# copies of the target ("Mortality Rate") give each row its own recent history, letting the
# model rank predictors of COVID-19 mortality per country.
# Create lagged variables for the previous-day, previous-week, and previous-month mortality
# rates via a per-country groupby + shift.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [196]:
# The lag features are undefined for the first day/week/month of each country's series;
# replace those leading NaNs with 0 in a single vectorized assignment.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [197]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] spans every column after 'date', which at this point
# includes 'Mortality Rate' and its three lagged copies — the prediction target leaks into
# the PCA inputs (and hence into X downstream), which likely explains the near-perfect
# CV/test scores. Also, PCA is fit on unscaled features, so large-magnitude variables
# dominate the components; consider standardizing before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[197]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [198]:
# Keep the first 7 principal components — one per input variable used in the
# Random Forest Model Analysis for the population health index.
n_components = 7
all_components = pca.transform(df_updated.iloc[:,2:])
principal_components = all_components[:, :n_components]
In [199]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original variables; reusing
# the raw feature names makes the downstream "feature importance" table rank components while
# appearing to rank raw features. Names like PC1..PC7 would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [200]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* indicator columns created here are not referenced afterwards
# in this analysis (X is built from principal_df); this step only removes the string column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [201]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set for the Random Forest model;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [202]:
# Fit scaling on the training set only, so test-set statistics do not leak
# into the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[202]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [203]:
# Apply the fitted scaler to the training set
X_train_scaled = scaler.transform(X_train)
In [204]:
# Apply the same training-set scaling to the test set
X_test_scaled = scaler.transform(X_test)
In [205]:
# Base RandomForestRegressor estimator for the grid search (seeded for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid explored by GridSearchCV
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [206]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows of a daily time series are shuffled into random CV folds here, so
# temporally adjacent, near-duplicate rows can land in both train and validation folds,
# which likely inflates the CV score; a time-series splitter would give a more honest
# estimate.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9993936365790189
In [207]:
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                       max_depth=grid_search.best_params_['max_depth'],
                                       min_samples_split=grid_search.best_params_['min_samples_split'],
                                       min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

y_pred = best_rf_model.predict(X_test_scaled)
In [208]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between the two arrays
# after normalizing each to sum to 1 — i.e. it treats the regression targets/predictions
# as probability distributions. Its value is hard to interpret as a regression metric;
# confirm this is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.02379433705546307
R2 Score: 0.9980488034443954
RMSE: 0.154254
Entropy Value: 0.0014017849110386118
In [209]:
# Rank the model inputs by their Random Forest importance scores, highest first.
# (Built as a single chain to avoid reusing one name for an array and then a DataFrame.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[209]:
feature importance
0 cardiovasc_death_rate 0.472670
1 diabetes_prevalence 0.340589
6 median_age 0.162234
2 female_smokers 0.021667
3 male_smokers 0.001448
5 aged_65_older 0.001235
4 life_expectancy 0.000157
In [210]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[210]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [211]:
country1 = 'Iceland'
country2 = 'Italy'

# Extracting important features for the Random Forest Model Analysis for the country health index:
# restrict rows to the two selected countries and keep only the needed columns in one step.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [212]:
# Preview the country-filtered feature subset (rich display via the cell's last expression)
df_updated
Out[212]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
20911 Iceland 2/28/2020 2.91 0.949 0.2 3.404 372903 0.000000
20912 Iceland 2/29/2020 2.91 0.949 0.2 3.404 372903 0.000000
20913 Iceland 3/1/2020 2.91 0.949 0.2 3.404 372903 0.000000
20914 Iceland 3/2/2020 2.91 0.949 0.2 3.404 372903 0.000000
20915 Iceland 3/3/2020 2.91 0.949 0.2 3.404 372903 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 205.859 59037472 0.735109

2100 rows × 8 columns

In [213]:
# Rationale: a Random Forest is an ensemble method for non-sequential, tabular data, so the
# OWID COVID-19 time series must first be reframed as a supervised-learning problem. Lagged
# copies of the target ("Mortality Rate") give each row its own recent history, letting the
# model rank predictors of COVID-19 mortality per country.
# Create lagged variables for the previous-day, previous-week, and previous-month mortality
# rates via a per-country groupby + shift.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [214]:
# The lag features are undefined for the first day/week/month of each country's series;
# replace those leading NaNs with 0 in a single vectorized assignment.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [215]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] spans every column after 'date', which at this point
# includes 'Mortality Rate' and its three lagged copies — the prediction target leaks into
# the PCA inputs (and hence into X downstream), which likely explains the near-perfect
# CV/test scores. Also, PCA is fit on unscaled features, so large-magnitude variables
# (e.g. population) dominate the components; consider standardizing before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[215]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [216]:
# Keep the first 5 principal components — one per input variable used in the
# Random Forest Model Analysis for the country health index.
n_components = 5
all_components = pca.transform(df_updated.iloc[:,2:])
principal_components = all_components[:, :n_components]
In [217]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original variables; reusing
# the raw feature names makes the downstream "feature importance" table rank components while
# appearing to rank raw features. Names like PC1..PC5 would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [218]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* indicator columns created here are not referenced afterwards
# in this analysis (X is built from principal_df); this step only removes the string column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [219]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set for the Random Forest model;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [220]:
# Fit scaling on the training set only, so test-set statistics do not leak
# into the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[220]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [221]:
# Apply the fitted scaler to the training set
X_train_scaled = scaler.transform(X_train)
In [222]:
# Apply the same training-set scaling to the test set
X_test_scaled = scaler.transform(X_test)
In [223]:
# Base RandomForestRegressor estimator for the grid search (seeded for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid explored by GridSearchCV
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [224]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows of a daily time series are shuffled into random CV folds here, so
# temporally adjacent, near-duplicate rows can land in both train and validation folds,
# which likely inflates the CV score; a time-series splitter would give a more honest
# estimate.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9993675631994823
In [225]:
# Refit a Random Forest using the best hyperparameters found above; best_params_ holds
# exactly the four keys of param_grid, so it can be unpacked directly.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [226]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between the two arrays
# after normalizing each to sum to 1 — i.e. it treats the regression targets/predictions
# as probability distributions. Its value is hard to interpret as a regression metric;
# confirm this is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.021371276450830683
R2 Score: 0.9982475006173722
RMSE: 0.146189
Entropy Value: 0.001247239760700292
In [227]:
# Rank the model inputs by their Random Forest importance scores, highest first.
# (Built as a single chain to avoid reusing one name for an array and then a DataFrame.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[227]:
feature importance
1 human_development_index 0.929630
0 hospital_beds_per_thousand 0.045362
2 extreme_poverty 0.023022
3 population_density 0.001775
4 population 0.000212
In [228]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[228]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [229]:
country1 = 'Netherlands'
country2 = 'Sweden'

# Extracting important features for Random Forest Model Analysis for the population health index:
# restrict rows to the two selected countries and keep only the needed columns in one step.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [230]:
# Preview the country-filtered feature subset (rich display via the cell's last expression)
df_updated
Out[230]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.816005

2100 rows × 10 columns

In [231]:
# Rationale: a Random Forest is an ensemble method for non-sequential, tabular data, so the
# OWID COVID-19 time series must first be reframed as a supervised-learning problem. Lagged
# copies of the target ("Mortality Rate") give each row its own recent history, letting the
# model rank predictors of COVID-19 mortality per country.
# Create lagged variables for the previous-day, previous-week, and previous-month mortality
# rates via a per-country groupby + shift.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [232]:
# The lag features are undefined for the first day/week/month of each country's series;
# replace those leading NaNs with 0 in a single vectorized assignment.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [233]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] spans every column after 'date', which at this point
# includes 'Mortality Rate' and its three lagged copies — the prediction target leaks into
# the PCA inputs (and hence into X downstream), which likely explains the near-perfect
# CV/test scores. Also, PCA is fit on unscaled features, so large-magnitude variables
# dominate the components; consider standardizing before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[233]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [234]:
# Keep the first 7 principal components — one per input variable used in the
# Random Forest Model Analysis for the population health index.
n_components = 7
all_components = pca.transform(df_updated.iloc[:,2:])
principal_components = all_components[:, :n_components]
In [235]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original variables; reusing
# the raw feature names makes the downstream "feature importance" table rank components while
# appearing to rank raw features. Names like PC1..PC7 would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [236]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* indicator columns created here are not referenced afterwards
# in this analysis (X is built from principal_df); this step only removes the string column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [237]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set for the Random Forest model;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [238]:
# Fit scaling on the training set only, so test-set statistics do not leak
# into the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[238]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [239]:
# Apply the fitted scaler to the training set
X_train_scaled = scaler.transform(X_train)
In [240]:
# Apply the same training-set scaling to the test set
X_test_scaled = scaler.transform(X_test)
In [241]:
# Base RandomForestRegressor estimator for the grid search (seeded for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid explored by GridSearchCV
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [242]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows of a daily time series are shuffled into random CV folds here, so
# temporally adjacent, near-duplicate rows can land in both train and validation folds,
# which likely inflates the CV score; a time-series splitter would give a more honest
# estimate.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984656768745841
In [243]:
# Refit a Random Forest using the best hyperparameters found above; best_params_ holds
# exactly the four keys of param_grid, so it can be unpacked directly.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [244]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between the two arrays
# after normalizing each to sum to 1 — i.e. it treats the regression targets/predictions
# as probability distributions. Its value is hard to interpret as a regression metric;
# confirm this is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010402460269562753
R2 Score: 0.9990051222850926
RMSE: 0.101992
Entropy Value: 0.00040629309095891516
In [245]:
# Rank the model inputs by their Random Forest importance scores, highest first.
# (Built as a single chain to avoid reusing one name for an array and then a DataFrame.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[245]:
feature importance
1 diabetes_prevalence 0.973529
2 female_smokers 0.023056
3 male_smokers 0.001375
0 cardiovasc_death_rate 0.000910
5 aged_65_older 0.000438
4 life_expectancy 0.000347
6 median_age 0.000345
In [246]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[246]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [247]:
country1 = 'Netherlands'
country2 = 'Sweden'

# Extracting important features for the Random Forest Model Analysis for the country health index:
# restrict rows to the two selected countries and keep only the needed columns in one step.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [248]:
# Preview the country-filtered feature subset (rich display via the cell's last expression)
df_updated
Out[248]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 24.718 10549349 0.816005

2100 rows × 8 columns

In [249]:
# Rationale: a Random Forest is an ensemble method for non-sequential, tabular data, so the
# OWID COVID-19 time series must first be reframed as a supervised-learning problem. Lagged
# copies of the target ("Mortality Rate") give each row its own recent history, letting the
# model rank predictors of COVID-19 mortality per country.
# Create lagged variables for the previous-day, previous-week, and previous-month mortality
# rates via a per-country groupby + shift.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [250]:
# The lag features are undefined for the first day/week/month of each country's series;
# replace those leading NaNs with 0 in a single vectorized assignment.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [251]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] spans every column after 'date', which at this point
# includes 'Mortality Rate' and its three lagged copies — the prediction target leaks into
# the PCA inputs (and hence into X downstream), which likely explains the near-perfect
# CV/test scores. Also, PCA is fit on unscaled features, so large-magnitude variables
# (e.g. population) dominate the components; consider standardizing before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[251]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [252]:
# Keep the first 5 principal components — one per input variable used in the
# Random Forest Model Analysis for the country health index.
n_components = 5
all_components = pca.transform(df_updated.iloc[:,2:])
principal_components = all_components[:, :n_components]
In [253]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original variables; reusing
# the raw feature names makes the downstream "feature importance" table rank components while
# appearing to rank raw features. Names like PC1..PC5 would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [254]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* indicator columns created here are not referenced afterwards
# in this analysis (X is built from principal_df); this step only removes the string column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [255]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()

# Hold out 30% of the rows as a test set for the Random Forest model;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [256]:
# Fit scaling on the training set
# Scaler statistics (mean/std) are learned from the training split only and
# reused on the test split -- avoids train/test leakage at this stage.
scaler = StandardScaler()
scaler.fit(X_train)
Out[256]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [257]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [258]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [259]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): n_estimators=100 here is a placeholder -- every grid value
# below overrides it; random_state=42 is what actually carries through the
# search (each grid candidate is a clone of this estimator).
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [260]:
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 candidates x 10 folds = 810 fits in parallel across
# all cores; results are unchanged because the estimator seed is fixed.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985072666667036
In [261]:
# GridSearchCV already refit a model with the best hyperparameters on the
# whole training set (refit=True is the default), so reuse it rather than
# rebuilding and retraining an identical RandomForestRegressor by hand --
# the clone carries the same random_state=42, so the model is the same.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [262]:
# Evaluate the Random Forest on the held-out test set: MSE, RMSE, R^2, and
# the KL divergence between the (normalized) actual and predicted vectors.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both inputs and returns the
# Kullback-Leibler divergence D(pk || qk), NOT the Shannon entropy of the
# predictions -- label it accordingly.  NOTE(review): KL divergence over
# normalized mortality-rate vectors is a questionable regression metric (it
# is inf whenever y_test is positive where y_pred is 0); consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("KL Divergence (y_test || y_pred):", entropy_val)
MSE:  0.011437542353413214
R2 Score: 0.9989061283863766
RMSE: 0.106946
Entropy Value: 0.0004613073619798649
In [263]:
# Rank inputs by the forest's impurity-based importances.
# NOTE(review): X was built from principal components, so these values rank
# the components, not the original variables whose names they carry.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[263]:
feature importance
1 human_development_index 0.974072
2 extreme_poverty 0.023855
3 population_density 0.001627
4 population 0.000396
0 hospital_beds_per_thousand 0.000050
In [264]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- not portable; prefer a
# configurable data directory (e.g. a Path constant in a config cell).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[264]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [265]:
# Country pair under comparison for this run of the pipeline.
country1 = 'United Kingdom'
country2 = 'Bulgaria'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [266]:
# Preview the filtered two-country frame.
df_updated
Out[266]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13606 United Kingdom 12/26/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13607 United Kingdom 12/27/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13608 United Kingdom 12/28/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13609 United Kingdom 12/29/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564

2090 rows × 10 columns

In [267]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Shifts are grouped by country so one country's history never leaks into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [268]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [269]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, so
# the target leaks into the components; PCA is also fit on unscaled data.
# This whole cell sequence duplicates the earlier pipeline -- consider a
# single parameterized function instead of copy-pasting per country pair.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[269]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [270]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # number of retained principal components (matches the 7 population-health inputs)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [271]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables whose names they carry -- see the note on the first pipeline run.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [272]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below; only 'Mortality Rate'
# is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [273]:
# Inputs: the 7 leading components; target: raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [274]:
# Fit scaling on the training set
# Mean/std learned on the training split only, reused on the test split.
scaler = StandardScaler()
scaler.fit(X_train)
Out[274]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [275]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [276]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [277]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): n_estimators=100 is a placeholder -- the grid overrides it;
# random_state=42 is what carries through the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [278]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.944267642996866
In [279]:
# fit random forest model with best hyperparameters from above
# GridSearchCV already refit a model with the best hyperparameters on the
# whole training set (refit=True is the default), so reuse it rather than
# rebuilding and retraining an identical RandomForestRegressor by hand --
# the clone carries the same random_state=42, so the model is the same.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [280]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# Evaluate the Random Forest on the held-out test set: MSE, RMSE, R^2, and
# the KL divergence between the (normalized) actual and predicted vectors.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) returns the Kullback-Leibler divergence
# D(pk || qk), NOT the Shannon entropy -- label it accordingly.
# NOTE(review): a questionable regression metric; consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("KL Divergence (y_test || y_pred):", entropy_val)
MSE:  1.43474289655929
R2 Score: 0.9257898367511572
RMSE: 1.197808
Entropy Value: 0.01142862992057888
In [281]:
# Rank inputs by the forest's impurity-based importances.
# NOTE(review): these rank the principal components, not the original
# variables whose names they carry.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[281]:
feature importance
1 diabetes_prevalence 0.867049
6 median_age 0.035735
2 female_smokers 0.027527
3 male_smokers 0.020658
4 life_expectancy 0.018178
0 cardiovasc_death_rate 0.016531
5 aged_65_older 0.014323
In [282]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Reload the pristine frame for the next pipeline run.
# NOTE(review): hard-coded absolute Windows path -- not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[282]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [283]:
# Same country pair as above, now for the country-health-index feature set.
country1 = 'United Kingdom'
country2 = 'Bulgaria'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [284]:
# Preview the filtered two-country frame.
df_updated
Out[284]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 2.540 0.932 0.2 272.898 67508936 0.883564
13606 United Kingdom 12/26/2022 2.540 0.932 0.2 272.898 67508936 0.883564
13607 United Kingdom 12/27/2022 2.540 0.932 0.2 272.898 67508936 0.883564
13608 United Kingdom 12/28/2022 2.540 0.932 0.2 272.898 67508936 0.883564
13609 United Kingdom 12/29/2022 2.540 0.932 0.2 272.898 67508936 0.883564

2090 rows × 8 columns

In [285]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Shifts are grouped by country so one country's history never leaks into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [286]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [287]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, so
# the target leaks into the components; PCA is also fit on unscaled data.
# Third copy of the same pipeline -- consider a parameterized function.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[287]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [288]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # number of retained principal components (matches the 5 country-health inputs)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [289]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables whose names they carry.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [290]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [291]:
# Inputs: the 5 leading components; target: raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [292]:
# Fit scaling on the training set
# Mean/std learned on the training split only, reused on the test split.
scaler = StandardScaler()
scaler.fit(X_train)
Out[292]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [293]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [294]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [295]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): n_estimators=100 is a placeholder -- the grid overrides it.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [296]:
# perform grid search and 10-fold cross-validation (k = 10)
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 810 fits; results unchanged (fixed seed).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9339708697999024
In [297]:
# GridSearchCV already refit a model with the best hyperparameters on the
# whole training set (refit=True is the default), so reuse it rather than
# rebuilding and retraining an identical RandomForestRegressor by hand.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [298]:
# Evaluate the Random Forest on the held-out test set: MSE, RMSE, R^2, and
# the KL divergence between the (normalized) actual and predicted vectors.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) returns the Kullback-Leibler divergence
# D(pk || qk), NOT the Shannon entropy -- label it accordingly.
# NOTE(review): a questionable regression metric; consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("KL Divergence (y_test || y_pred):", entropy_val)
MSE:  0.7342554973835562
R2 Score: 0.9620216134487464
RMSE: 0.856887
Entropy Value: 0.007601507191777842
In [299]:
# Rank inputs by the forest's impurity-based importances.
# NOTE(review): these rank the principal components, not the original
# variables whose names they carry.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[299]:
feature importance
1 human_development_index 0.881448
2 extreme_poverty 0.054324
4 population 0.037373
3 population_density 0.025757
0 hospital_beds_per_thousand 0.001098
In [300]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Reload the pristine frame for the next pipeline run.
# NOTE(review): hard-coded absolute Windows path -- not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[300]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [301]:
# Next country pair: Cyprus vs Czechia, population-health feature set.
country1 = 'Cyprus'
country2 = 'Czechia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [302]:
# Preview the filtered two-country frame.
df_updated
Out[302]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919575

2061 rows × 10 columns

In [303]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Shifts are grouped by country so one country's history never leaks into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [304]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [305]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, so
# the target leaks into the components; PCA is also fit on unscaled data.
# Fourth copy of the same pipeline -- consider a parameterized function.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[305]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [306]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # number of retained principal components (matches the 7 population-health inputs)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [307]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables whose names they carry.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [308]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [309]:
# Inputs: the 7 leading components; target: raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [310]:
# Fit scaling on the training set
# Mean/std learned on the training split only, reused on the test split.
scaler = StandardScaler()
scaler.fit(X_train)
Out[310]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [311]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [312]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [313]:
# Instantiate the RandomForestRegressor Model
# NOTE(review): n_estimators=100 is a placeholder -- the grid overrides it.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [314]:
# perform grid search and 10-fold cross-validation (k = 10)
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes the 810 fits; results unchanged (fixed seed).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9931354068766577
In [315]:
# GridSearchCV already refit a model with the best hyperparameters on the
# whole training set (refit=True is the default), so reuse it rather than
# rebuilding and retraining an identical RandomForestRegressor by hand.
best_rf_model = grid_search.best_estimator_

# Predict mortality rates for the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [316]:
# Evaluate the Random Forest on the held-out test set: MSE, RMSE, R^2, and
# the KL divergence between the (normalized) actual and predicted vectors.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) returns the Kullback-Leibler divergence
# D(pk || qk), NOT the Shannon entropy -- label it accordingly.
# NOTE(review): a questionable regression metric; consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("KL Divergence (y_test || y_pred):", entropy_val)
MSE:  0.00047399699097167984
R2 Score: 0.9991753265149302
RMSE: 0.021771
Entropy Value: 0.00015620474538757492
In [317]:
# Rank inputs by the forest's impurity-based importances.
# NOTE(review): these rank the principal components, not the original
# variables whose names they carry.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[317]:
feature importance
1 diabetes_prevalence 0.658891
0 cardiovasc_death_rate 0.219210
5 aged_65_older 0.052242
6 median_age 0.044419
2 female_smokers 0.022219
3 male_smokers 0.001522
4 life_expectancy 0.001498
In [318]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Reload the pristine frame for the next pipeline run.
# NOTE(review): hard-coded absolute Windows path -- not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[318]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [319]:
country1 = 'Cyprus'
country2 = 'Czechia'

# Keep only the country-health-index features (plus keys and target) for this
# country pair. Row filter and column selection are combined in a single .loc.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                     'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
In [320]:
# Quick visual sanity-check of the filtered two-country frame
df_updated
Out[320]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 127.657 896007 0.000000
... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.63 0.900 0.00 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.63 0.900 0.00 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.63 0.900 0.00 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.63 0.900 0.00 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.63 0.900 0.00 137.176 10493990 0.919575

2061 rows × 8 columns

In [321]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive lagged mortality features (1 day, 7 days, 30 days back) within each country,
# reusing a single grouped series instead of re-grouping per column.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_name, lag_days in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    df_updated[lag_name] = mortality_by_country.shift(lag_days)
In [322]:
# The shift() calls above leave NaNs in the first rows of each country's lag
# window; replace them with 0 so the model can consume every row.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [323]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' (the target) and
# its three lag columns, so the fitted components encode the label itself — this
# leaks the target into the model inputs and inflates the downstream R^2.
# Also: PCA is fit on the FULL dataset before the train/test split, and on unscaled
# features (StandardScaler is only applied after PCA below); conventional order is
# scale -> PCA, fit on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[323]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [324]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# NOTE(review): these are the first 5 of 9 fitted components (5 features + target
# + 3 lag columns), not a transformation of the 5 named features alone.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [325]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns are principal components PC1..PC5 relabeled with the
# original feature names; the names do not describe the underlying values, which
# makes the later feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [326]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used afterwards —
# X below is built from principal_df — so this step only renames columns in df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [327]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# NOTE(review): X holds the first five principal components relabeled with feature
# names (see the cell that builds principal_df), not the raw features; y is taken
# straight from df_updated.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of daily time-series rows places adjacent,
# near-duplicate days in both train and test, so the test score is optimistic.
# A chronological (date-based) split would be a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [328]:
# Fit scaling on the training set
# NOTE(review): standardization is conventionally applied to the raw features
# *before* PCA so each feature contributes comparably to the components; scaling
# the component scores afterwards has little effect on a Random Forest, which is
# largely insensitive to monotonic feature scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[328]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [329]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses train-set statistics only
In [330]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuses the train-fit scaler — correct, no refit on test
In [331]:
# Base estimator for the hyperparameter search (seeded for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter grid explored by GridSearchCV: 3*3*3*3 = 81 candidate settings
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [332]:
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 81 candidates x 10 folds = 810 forest fits; consider
# RandomizedSearchCV or fewer folds if runtime matters. The folds are random row
# subsets of a daily time series, so CV scores inherit the optimism of shuffled
# splitting on autocorrelated data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9866151912407417
In [333]:
# fit random forest model with best hyperparameters from above
# Unpacking grid_search.best_params_ keeps this cell in sync with the search grid
# and avoids silently dropping a tuned hyperparameter if the grid ever changes
# (the previous version copied each parameter by hand).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [334]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# FIXME(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns their KL divergence — not a regression error metric;
# zeros in either array can produce inf/nan. Reconsider reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008157893614715191
R2 Score: 0.9858066639953047
RMSE: 0.090321
Entropy Value: 0.002246714925831334
In [335]:
feature_importances = best_rf_model.feature_importances_
# NOTE(review): 'selected_cols' here label principal components, not the raw
# variables — the importances below rank PCs, so reading them as importances of
# the named original features is misleading.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[335]:
feature importance
1 human_development_index 0.818042
0 hospital_beds_per_thousand 0.113415
2 extreme_poverty 0.051214
3 population_density 0.013891
4 population 0.003438
In [336]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[336]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [337]:
country1 = 'Estonia'
country2 = 'Latvia'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): this per-country-pair pipeline (select -> lag -> PCA -> split ->
# scale -> grid search -> evaluate) is copy-pasted for every pair; factoring it
# into one function parameterized by (country1, country2, feature_cols) would
# shrink the notebook and prevent the copies from drifting apart.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [338]:
df_updated
Out[338]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2099 rows × 10 columns

In [339]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [340]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [341]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): df_updated.iloc[:, 2:] contains 'Mortality Rate' (the target) and
# its three lag columns, so the components leak the label into the model inputs;
# PCA is also fit on the full dataset before the train/test split and on unscaled
# features. Conventional order: fit StandardScaler then PCA on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[341]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [342]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [343]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [344]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [345]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [346]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[346]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [347]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [348]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [349]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [350]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9979817144366736
In [351]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ instead of copying each hyperparameter by hand — the manual
# copy silently drops any parameter later added to the search grid.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [352]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# FIXME(review): scipy.stats.entropy computes a KL divergence between normalized
# distributions — not a regression error metric; zeros can yield inf/nan.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0018696016902774828
R2 Score: 0.9968998375395624
RMSE: 0.043239
Entropy Value: 0.0007273351883650379
In [353]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[353]:
feature importance
1 diabetes_prevalence 0.954112
2 female_smokers 0.017664
0 cardiovasc_death_rate 0.012118
5 aged_65_older 0.010583
6 median_age 0.004349
3 male_smokers 0.000941
4 life_expectancy 0.000232
In [354]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[354]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [355]:
country1 = 'Estonia'
country2 = 'Latvia'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [356]:
df_updated
Out[356]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.5 31.033 1326064 0.000000
6250 Estonia 1/18/2020 4.69 0.892 0.5 31.033 1326064 0.000000
6251 Estonia 2/5/2020 4.69 0.892 0.5 31.033 1326064 0.000000
6252 Estonia 2/6/2020 4.69 0.892 0.5 31.033 1326064 0.000000
6253 Estonia 2/7/2020 4.69 0.892 0.5 31.033 1326064 0.000000
... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.7 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.7 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.7 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.7 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.7 31.212 1850654 0.631969

2099 rows × 8 columns

In [357]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [358]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [359]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): as in the earlier pairs, df_updated.iloc[:, 2:] includes
# 'Mortality Rate' and its lag columns — target leakage into the components —
# and PCA is fit on the full, unscaled dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[359]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [360]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [361]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [362]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [363]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [364]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[364]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [365]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [366]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [367]:
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [368]:
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975170379181927
In [369]:
# fit random forest model with best hyperparameters from above
# Unpack best_params_ rather than re-listing every hyperparameter — keeps the
# refit in lockstep with the grid definition.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
In [370]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002618029320035362
R2 Score: 0.995658799272323
RMSE: 0.051167
Entropy Value: 0.0010107185902514522
In [371]:
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[371]:
feature importance
1 human_development_index 0.960201
2 extreme_poverty 0.023049
0 hospital_beds_per_thousand 0.014483
3 population_density 0.001800
4 population 0.000466
In [372]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[372]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [373]:
country1 = 'Portugal'
country2 = 'Romania'

# Extracting important features for Random Forest Model Analysis for the population health index
# NOTE(review): duplicated per-pair pipeline — identical to the earlier pairs
# except for the country names; a helper function would remove the repetition.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [374]:
df_updated
Out[374]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403

2072 rows × 10 columns

In [375]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [376]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [377]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): df_updated.iloc[:, 2:] again includes 'Mortality Rate' and its
# three lag columns (target leakage into the components), and PCA is fit on the
# full, unscaled dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[377]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [378]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keeps only the first 7 principal components of the full PCA space computed above.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [379]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component — a linear combination of ALL the
# PCA inputs (including the mortality columns) — not the raw variable whose name it carries.
# These inherited labels make the later feature-importance table read as if it ranked the
# raw health variables, which it does not.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Row order of principal_df matches df_updated, so location can be copied positionally.
principal_df['location'] = df_updated['location'].values
In [380]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are never used downstream — X is built from
# principal_df and y from 'Mortality Rate' only — so this encoding step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [381]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# X and y come from two different frames; correctness relies on identical row order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of daily time-series rows places adjacent days in both
# train and test; with highly autocorrelated mortality series the near-perfect scores below are
# likely optimistic. A time-based or group (per-country) split would be a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [382]:
# Fit scaling on the training set
# (fit on train only — good; note scaling here happens AFTER PCA, not before it)
scaler = StandardScaler()
scaler.fit(X_train)
Out[382]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [383]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [384]:
# Apply scaling on the test set (uses training-set statistics, as it should)
X_test_scaled = scaler.transform(X_test)
In [385]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 here is only a placeholder; the grid below overrides it during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [386]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scorer for a regressor is R^2; cv=10 uses standard (unshuffled) KFold over rows
# that were already shuffled by train_test_split.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9962992681061019
In [387]:
# Refit a random forest on the training set using the best hyperparameters from the grid
# search. best_params_ contains exactly the four grid keys, so dict-unpacking it builds the
# same estimator as spelling each lookup out by hand (grid_search.best_estimator_ is an
# already-refit equivalent, since GridSearchCV refits on the full training set by default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Generate predictions on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [388]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1 and returns
# their KL divergence — it is not a standard regression error metric, and zero entries in
# y_test contribute nothing to the sum. Confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005699053732913187
R2 Score: 0.9966620033088617
RMSE: 0.075492
Entropy Value: 0.00040310281139748264
In [389]:
# Importance of each input column to the fitted random forest, sorted descending.
# NOTE(review): the model was trained on principal components, so each row here is the
# importance of a PC that merely inherited a raw-variable name from principal_df; reading
# e.g. 'diabetes_prevalence' as the top raw predictor is not supported by this table.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[389]:
feature importance
1 diabetes_prevalence 0.590038
0 cardiovasc_death_rate 0.335347
5 aged_65_older 0.038765
2 female_smokers 0.020746
6 median_age 0.008911
3 male_smokers 0.005290
4 life_expectancy 0.000903
In [390]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine; prefer a
# configurable DATA_DIR. The file loaded is the full 27,272-row, all-country dataset, not
# only "the first countries in each pairing" as the comment suggests — verify the comment.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[390]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [391]:
# NOTE(review): this load -> select -> lag -> PCA -> RF pipeline is copy-pasted once per
# country pair / feature set; factoring it into a parameterized function would remove the
# duplication and the risk of the copies drifting apart.
country1 = 'Portugal'
country2 = 'Romania'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [392]:
df_updated
Out[392]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
10484 Portugal 3/1/2020 3.390 0.864 0.5 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.390 0.864 0.5 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.390 0.864 0.5 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.390 0.864 0.5 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.390 0.864 0.5 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.7 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.7 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.7 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.7 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.7 85.129 19659270 2.036403

2072 rows × 8 columns

In [393]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate, so a lag never crosses a country boundary
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [394]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): 0 is also a legitimate mortality rate, so warm-up rows become
# indistinguishable from true zero-mortality days; consider dropping them instead.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [395]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the columns at this point are [location, date, 5 country features,
# 'Mortality Rate', prev_day/week/month_mortality], so iloc[:, 2:] includes the TARGET and
# its lags in the PCA inputs (target leakage). PCA is also fit on unscaled data over the
# full dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[395]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [396]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keeps only the first 5 principal components of the full PCA space computed above.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [397]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component — a linear combination of ALL the
# PCA inputs (including the mortality columns) — not the raw variable whose name it carries.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
# Row order of principal_df matches df_updated, so location can be copied positionally.
principal_df['location'] = df_updated['location'].values
In [398]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are never used downstream — X is built from
# principal_df and y from 'Mortality Rate' only — so this encoding step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [399]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
# X and y come from two different frames; correctness relies on identical row order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of daily time-series rows places adjacent days in
# both train and test; scores below are likely optimistic as a result.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [400]:
# Fit scaling on the training set (fit on train only; applied after PCA, not before)
scaler = StandardScaler()
scaler.fit(X_train)
Out[400]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [401]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [402]:
# Apply scaling on the test set (uses training-set statistics, as it should)
X_test_scaled = scaler.transform(X_test)
In [403]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 here is only a placeholder; the grid below overrides it during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [404]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scorer for a regressor is R^2; cv=10 uses standard (unshuffled) KFold.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9971895025318556
In [405]:
# Refit a random forest on the training set using the best hyperparameters from the grid
# search. best_params_ contains exactly the four grid keys, so dict-unpacking it builds the
# same estimator as spelling each lookup out by hand (grid_search.best_estimator_ is an
# already-refit equivalent, since GridSearchCV refits on the full training set by default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Generate predictions on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [406]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1 and returns
# their KL divergence — not a standard regression metric; zero entries in y_test drop out.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009693221423902118
R2 Score: 0.9943225765967789
RMSE: 0.098454
Entropy Value: 0.0009866503695753954
In [407]:
# Importance of each input column to the fitted random forest, sorted descending.
# NOTE(review): the model was trained on principal components, so these are importances of
# PCs that inherited raw-variable names from principal_df; reading 'human_development_index'
# as the top raw predictor is not supported by this table.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[407]:
feature importance
1 human_development_index 0.881243
0 hospital_beds_per_thousand 0.078831
2 extreme_poverty 0.029944
3 population_density 0.009145
4 population 0.000837
In [438]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine. The file is
# the full all-country dataset, not only "the first countries in each pairing" — verify the comment.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[438]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [439]:
# NOTE(review): third copy of the same per-country-pair pipeline; consider one
# parameterized function instead of repeated cells.
country1 = 'Serbia'
country2 = 'Slovakia'

# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [440]:
df_updated
Out[440]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.717058
16755 Serbia 12/26/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716963
16756 Serbia 12/27/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716677
16757 Serbia 12/28/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716395
16758 Serbia 12/29/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716205

2067 rows × 10 columns

In [441]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate, so a lag never crosses a country boundary
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [442]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): 0 is also a legitimate mortality rate; warm-up rows become
# indistinguishable from true zero-mortality days.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [443]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans the 7 predictors plus 'Mortality Rate' and its three lag
# columns — the target leaks into the PCA inputs. PCA is fit on unscaled data over the full
# dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[443]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [444]:
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Keeps only the first 7 principal components of the full PCA space computed above.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [445]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component (a mix of ALL PCA inputs,
# including the mortality columns), not the raw variable whose name it carries.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Row order of principal_df matches df_updated, so location can be copied positionally.
principal_df['location'] = df_updated['location'].values
In [446]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are never used downstream (X comes from
# principal_df, y from 'Mortality Rate'); this step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [447]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# X and y come from two different frames; correctness relies on identical row order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled random split of daily time-series rows — likely optimistic scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [448]:
# Fit scaling on the training set (fit on train only; applied after PCA, not before)
scaler = StandardScaler()
scaler.fit(X_train)
Out[448]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [449]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [450]:
# Apply scaling on the test set (uses training-set statistics, as it should)
X_test_scaled = scaler.transform(X_test)
In [451]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 here is only a placeholder; the grid below overrides it during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [452]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scorer for a regressor is R^2; cv=10 uses standard (unshuffled) KFold.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9930228916370709
In [453]:
# Refit a random forest on the training set using the best hyperparameters from the grid
# search. best_params_ contains exactly the four grid keys, so dict-unpacking it builds the
# same estimator as spelling each lookup out by hand (grid_search.best_estimator_ is an
# already-refit equivalent, since GridSearchCV refits on the full training set by default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Generate predictions on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [454]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1 and returns
# their KL divergence — not a standard regression metric; zero entries in y_test drop out.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0009018432472984047
R2 Score: 0.996309224905843
RMSE: 0.030031
Entropy Value: 0.0004574118660922552
In [455]:
# Importance of each input column to the fitted random forest, sorted descending.
# NOTE(review): these are importances of principal components that inherited raw-variable
# names from principal_df, not of the raw health variables themselves.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[455]:
feature importance
1 diabetes_prevalence 0.872557
6 median_age 0.095584
2 female_smokers 0.017262
5 aged_65_older 0.008445
3 male_smokers 0.003182
0 cardiovasc_death_rate 0.001540
4 life_expectancy 0.001430
In [456]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine. The file is
# the full all-country dataset, not only "the first countries in each pairing" — verify the comment.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[456]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [457]:
# NOTE(review): fourth copy of the same per-country-pair pipeline; consider one
# parameterized function instead of repeated cells.
country1 = 'Serbia'
country2 = 'Slovakia'

# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [458]:
df_updated
Out[458]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.820 0.860 0.70 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.820 0.860 0.70 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.820 0.860 0.70 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.820 0.860 0.70 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.820 0.860 0.70 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 0.05 80.291 6871547 0.717058
16755 Serbia 12/26/2022 5.609 0.806 0.05 80.291 6871547 0.716963
16756 Serbia 12/27/2022 5.609 0.806 0.05 80.291 6871547 0.716677
16757 Serbia 12/28/2022 5.609 0.806 0.05 80.291 6871547 0.716395
16758 Serbia 12/29/2022 5.609 0.806 0.05 80.291 6871547 0.716205

2067 rows × 8 columns

In [459]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate, so a lag never crosses a country boundary
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [460]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): 0 is also a legitimate mortality rate; warm-up rows become
# indistinguishable from true zero-mortality days.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [461]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans the 5 predictors plus 'Mortality Rate' and its three lag
# columns — the target leaks into the PCA inputs. PCA is fit on unscaled data over the full
# dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[461]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [462]:
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Keeps only the first 5 principal components of the full PCA space computed above.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [463]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component (a mix of ALL PCA inputs,
# including the mortality columns), not the raw variable whose name it carries.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
# Row order of principal_df matches df_updated, so location can be copied positionally.
principal_df['location'] = df_updated['location'].values
In [464]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummy columns are never used downstream (X comes from
# principal_df, y from 'Mortality Rate'); this step appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [465]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
# X and y come from two different frames; correctness relies on identical row order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled random split of daily time-series rows — likely optimistic scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [466]:
# Fit scaling on the training set (fit on train only; applied after PCA, not before)
scaler = StandardScaler()
scaler.fit(X_train)
Out[466]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [467]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [468]:
# Apply scaling on the test set (uses training-set statistics, as it should)
X_test_scaled = scaler.transform(X_test)
In [469]:
# Instantiate the RandomForestRegressor Model
# n_estimators=100 here is only a placeholder; the grid below overrides it during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
In [470]:
# perform grid search and 10-fold cross-validation (k = 10)
# Default scorer for a regressor is R^2; cv=10 uses standard (unshuffled) KFold.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9925952418557594
In [471]:
# Refit a random forest on the training set using the best hyperparameters from the grid
# search. best_params_ contains exactly the four grid keys, so dict-unpacking it builds the
# same estimator as spelling each lookup out by hand (grid_search.best_estimator_ is an
# already-refit equivalent, since GridSearchCV refits on the full training set by default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Generate predictions on the held-out (scaled) test set
y_pred = best_rf_model.predict(X_test_scaled)
In [472]:
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1 and returns
# their KL divergence — not a standard regression metric; zero entries in y_test drop out.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0008275646091810585
R2 Score: 0.9966132087172344
RMSE: 0.028767
Entropy Value: 0.000479760324669255
In [473]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): if the model was trained on principal components (as in the
# analogous cells later in this notebook), these labels name PC positions,
# not the original features directly — TODO confirm against the X used here.
importances = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importances})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[473]:
feature importance
1 human_development_index 0.911824
0 hospital_beds_per_thousand 0.045153
2 extreme_poverty 0.034897
3 population_density 0.006234
4 population 0.001892
In [474]:
# Importing the dataframe that includes the first countries in each pairing
# of countries from the previous step.
# NOTE(review): hardcoded absolute local path — prefer a configurable
# DATA_DIR / relative Path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[474]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [475]:
country1 = 'Slovenia'
country2 = 'Spain'

# Extracting important features for Random Forest Model Analysis for the
# population health index (identifiers + health features + target).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two countries being compared. .copy() materialises the
# filtered slice so the lagged-column assignments in later cells mutate an
# owned frame rather than a view of the original (avoids
# SettingWithCopyWarning and potential silent no-op assignments).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [476]:
df_updated
Out[476]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
24074 Spain 2/1/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24075 Spain 2/2/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24076 Spain 2/3/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24077 Spain 2/4/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24078 Spain 2/5/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2125 rows × 10 columns

In [477]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, week, and month mortality
# rates. groupby('location') keeps each country's shift within its own
# series, so lags never bleed across the country boundary.
# NOTE(review): shift(k) means "k rows earlier", which equals "k days earlier"
# only if rows are sorted by date within each location — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [478]:
# The first 1/7/30 rows of each country's series have no lagged value and
# come out as NaN; fill them with 0 (each series starts at a 0 mortality
# rate, so 0 is a natural fill here).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [479]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the matrix
# fed to PCA still contains 'Mortality Rate' (the target) and its three lag
# columns — the resulting components encode the target, leaking it into the
# model inputs and inflating downstream scores.
# NOTE(review): PCA is fit on the full dataset before the train/test split
# (test-set leakage) and on unstandardised columns, so large-magnitude
# variables dominate the components. Conventional order: split, scale on
# train, fit PCA on scaled train features only, transform both splits.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[479]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [480]:
# Keep the first 7 principal components to match the number of input
# variables for the Random Forest Model Analysis for the population health index.
# NOTE(review): the transformed matrix has 11 numeric columns here (7 health
# features + 'Mortality Rate' + 3 lag columns), so each kept component is a
# mixture of all 11 — including the target — not a 1:1 stand-in for the
# 7 original features.
n_components = 7  # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [481]:
# Wrap the principal components in a DataFrame.
# NOTE(review): these columns are principal components (linear combinations
# of all numeric inputs), not the original variables. Reusing the original
# feature names makes the later feature-importance table misleading; names
# like 'PC1'..'PC7' would be more honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# .values strips the filtered index so rows align positionally with the PCA output.
principal_df['location'] = df_updated['location'].values
In [482]:
# One-hot encode 'location' with get_dummies().
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df, y from 'Mortality Rate'), so the net effect of this cell is
# only to remove the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [483]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# NOTE(review): X holds principal components computed from a matrix that
# included 'Mortality Rate' and its lags, so the target is encoded in the
# predictors — this likely explains the near-perfect R^2 reported below.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset 70/30 into training and testing sets.
# NOTE(review): a random shuffle split on a daily time series with lagged
# features puts near-duplicate adjacent days into both train and test; a
# chronological split would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [484]:
# Fit the StandardScaler on the training split only (correct: test rows do
# not influence the scaling statistics).
# NOTE(review): standardising AFTER PCA is unusual; the conventional order is
# to standardise before PCA so each variable contributes comparably to the
# components.
scaler = StandardScaler()
scaler.fit(X_train)
Out[484]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [485]:
# Apply the fitted scaling to the training set (transform only).
X_train_scaled = scaler.transform(X_train)
In [486]:
# Apply the training-fitted scaling to the test set (transform only).
X_test_scaled = scaler.transform(X_test)
In [487]:
# Base estimator for the grid search. Every hyperparameter below is set per
# candidate by GridSearchCV, so only random_state matters on this instance.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space: 3 * 3 * 3 * 3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [488]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidates x 10 folds in parallel on all CPU cores;
# results are unchanged because each forest is still seeded via random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print best hyperparameters and the corresponding mean CV score (default
# scoring for a regressor is R^2).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985269601145046
In [489]:
# Refit a random forest with the best hyperparameters found by the search.
# (GridSearchCV already refits on the full training set, so
# grid_search.best_estimator_ is an equivalent, ready-to-use model; the
# explicit construction here keeps the configuration visible.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict the mortality rate on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [490]:
# Evaluate the Random Forest Model: Mean Squared Error (MSE), Root Mean
# Squared Error (RMSE), R^2 Score, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# distributions after normalising each to sum to 1. y_test / y_pred are raw
# mortality rates, not probability distributions, so this number is hard to
# interpret as a regression metric (and becomes inf wherever a predicted
# value is 0 while the true value is not). Consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004771901735090741
R2 Score: 0.9992681453869718
RMSE: 0.069079
Entropy Value: 0.00029331989492224016
In [491]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the model was trained on principal components, so these
# labels name PC positions, not the original features directly.
importances = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importances})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[491]:
feature importance
1 diabetes_prevalence 0.895143
0 cardiovasc_death_rate 0.074364
2 female_smokers 0.019122
6 median_age 0.004778
3 male_smokers 0.003564
5 aged_65_older 0.002714
4 life_expectancy 0.000316
In [492]:
# Re-import the dataframe that includes the first countries in each pairing
# of countries from the previous step (restores the columns dropped above).
# NOTE(review): hardcoded absolute local path — prefer a configurable
# DATA_DIR / relative Path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[492]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [493]:
country1 = 'Slovenia'
country2 = 'Spain'

# Extracting important features for the Random Forest Model Analysis for the
# country health index (identifiers + infrastructure/economic features + target).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two countries being compared. .copy() materialises the
# filtered slice so the lagged-column assignments in later cells mutate an
# owned frame rather than a view of the original (avoids
# SettingWithCopyWarning and potential silent no-op assignments).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [494]:
df_updated
Out[494]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
24074 Spain 2/1/2020 2.97 0.904 1.0 93.105 47558632 0.000000
24075 Spain 2/2/2020 2.97 0.904 1.0 93.105 47558632 0.000000
24076 Spain 2/3/2020 2.97 0.904 1.0 93.105 47558632 0.000000
24077 Spain 2/4/2020 2.97 0.904 1.0 93.105 47558632 0.000000
24078 Spain 2/5/2020 2.97 0.904 1.0 93.105 47558632 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 102.619 2119843 0.536669

2125 rows × 8 columns

In [495]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, week, and month mortality
# rates. groupby('location') keeps each country's shift within its own
# series, so lags never bleed across the country boundary.
# NOTE(review): shift(k) means "k rows earlier", which equals "k days earlier"
# only if rows are sorted by date within each location — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [496]:
# The first 1/7/30 rows of each country's series have no lagged value and
# come out as NaN; fill them with 0 (each series starts at a 0 mortality
# rate, so 0 is a natural fill here).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [497]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the matrix
# fed to PCA still contains 'Mortality Rate' (the target) and its three lag
# columns — the resulting components encode the target, leaking it into the
# model inputs and inflating downstream scores.
# NOTE(review): PCA is fit on the full dataset before the train/test split
# (test-set leakage) and on unstandardised columns, so large-magnitude
# variables such as 'population' dominate the components. Conventional
# order: split, scale on train, fit PCA on scaled train features only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[497]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [498]:
# Keep the first 5 principal components to match the number of input
# variables for the Random Forest Model Analysis for the country health index.
# NOTE(review): the transformed matrix has 9 numeric columns here (5 index
# features + 'Mortality Rate' + 3 lag columns), so each kept component is a
# mixture of all 9 — including the target — not a 1:1 stand-in for the
# 5 original features.
n_components = 5  # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [499]:
# Wrap the principal components in a DataFrame.
# NOTE(review): these columns are principal components (linear combinations
# of all numeric inputs), not the original variables. Reusing the original
# feature names makes the later feature-importance table misleading; names
# like 'PC1'..'PC5' would be more honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
# .values strips the filtered index so rows align positionally with the PCA output.
principal_df['location'] = df_updated['location'].values
In [500]:
# One-hot encode 'location' with get_dummies().
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df, y from 'Mortality Rate'), so the net effect of this cell is
# only to remove the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [501]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# NOTE(review): X holds principal components computed from a matrix that
# included 'Mortality Rate' and its lags, so the target is encoded in the
# predictors — this likely explains the near-perfect R^2 reported below.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset 70/30 into training and testing sets.
# NOTE(review): a random shuffle split on a daily time series with lagged
# features puts near-duplicate adjacent days into both train and test; a
# chronological split would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [502]:
# Fit the StandardScaler on the training split only (correct: test rows do
# not influence the scaling statistics).
# NOTE(review): standardising AFTER PCA is unusual; the conventional order is
# to standardise before PCA so each variable contributes comparably to the
# components.
scaler = StandardScaler()
scaler.fit(X_train)
Out[502]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [503]:
# Apply the fitted scaling to the training set (transform only).
X_train_scaled = scaler.transform(X_train)
In [504]:
# Apply the training-fitted scaling to the test set (transform only).
X_test_scaled = scaler.transform(X_test)
In [505]:
# Base estimator for the grid search. Every hyperparameter below is set per
# candidate by GridSearchCV, so only random_state matters on this instance.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space: 3 * 3 * 3 * 3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
In [506]:
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidates x 10 folds in parallel on all CPU cores;
# results are unchanged because each forest is still seeded via random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print best hyperparameters and the corresponding mean CV score (default
# scoring for a regressor is R^2).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9982082361713068
In [507]:
# Refit a random forest with the best hyperparameters found by the search.
# (GridSearchCV already refits on the full training set, so
# grid_search.best_estimator_ is an equivalent, ready-to-use model; the
# explicit construction here keeps the configuration visible.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)

# Predict the mortality rate on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
In [508]:
# Evaluate the Random Forest Model: Mean Squared Error (MSE), Root Mean
# Squared Error (RMSE), R^2 Score, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# distributions after normalising each to sum to 1. y_test / y_pred are raw
# mortality rates, not probability distributions, so this number is hard to
# interpret as a regression metric (and becomes inf wherever a predicted
# value is 0 while the true value is not). Consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006855328683786699
R2 Score: 0.9989486154159127
RMSE: 0.082797
Entropy Value: 0.00046734218153940193
In [509]:
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the model was trained on principal components, so these
# labels name PC positions, not the original features directly.
importances = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importances})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[509]:
feature importance
1 human_development_index 0.961494
2 extreme_poverty 0.024439
0 hospital_beds_per_thousand 0.009664
3 population_density 0.004009
4 population 0.000394
In [ ]: